Extraer datos (scraping) de una página para obtener un enlace javascript codificado

Estoy trabajando en una tarea en la que tengo que obtener detalles de una página web utilizando la biblioteca requests de Python. He obtenido con éxito los detalles de la página, pero tiene un botón “Mostrar detalles” que obtiene más detalles mediante una llamada ajax, y ahora también necesito obtener esos detalles adicionales. ¿Puede alguien ayudarme a hacerlo? Aquí está el enlace al sitio web: – http://ipindiaonline.gov.in/tmrpublicsearch/frmmain.aspx y una captura de pantalla del sitio web aquí:
introduzca la descripción de la imagen aquí

Aquí está mi código que he hecho.

from bs4 import BeautifulSoup
import requests
import json


def returnJson(wordmark, page_class):
    """Submit the trademark wordmark search form and return the response.

    Performs an initial GET to collect the ASP.NET hidden state fields
    (__VIEWSTATE / __EVENTVALIDATION) — the server rejects a POST that
    does not echo them back — then POSTs the search.
    """
    url = "http://ipindiaonline.gov.in/tmrpublicsearch/frmmain.aspx"
    r_init = requests.get(url)
    soup = BeautifulSoup(r_init.text, 'html.parser')
    event_validation = soup.find("input", attrs={"name": "__EVENTVALIDATION"})['value']
    view_state = soup.find("input", attrs={"name": "__VIEWSTATE"})['value']
    postdata = {
        'ctl00$ContentPlaceHolder1$DDLFilter': '0',
        'ctl00$ContentPlaceHolder1$DDLSearchType': 'WM',  # 'WM' = search by wordmark
        'ctl00$ContentPlaceHolder1$TBWordmark': wordmark,
        'ctl00$ContentPlaceHolder1$TBClass': page_class,
        '__EVENTVALIDATION': event_validation,
        "__EVENTTARGET": "ctl00$ContentPlaceHolder1$BtnSearch",
        "__VIEWSTATE": view_state,
    }
    return requests.post(url, data=postdata)


def scrapping(r):
    """Parse a search-results response into a list of per-row dicts.

    Result rows alternate between the 'row' and 'alt' CSS classes, so the
    row count is the sum of both.  Each field of row *i* is rendered in a
    <span> whose id is a fixed prefix followed by the row index.
    """
    soup = BeautifulSoup(r.text, 'html.parser')
    counter = len(soup.find_all('tr', attrs={'class': 'row'}))
    counter += len(soup.find_all('tr', attrs={'class': 'alt'}))
    # (output key, span-id prefix); 'Class ' keeps its original trailing space
    fields = [
        ('Wordmark', 'ContentPlaceHolder1_MGVSearchResult_lblsimiliarmark_'),
        ('Proprietor', 'ContentPlaceHolder1_MGVSearchResult_LblVProprietorName_'),
        ('Application Number', 'ContentPlaceHolder1_MGVSearchResult_lblapplicationnumber_'),
        ('Class ', 'ContentPlaceHolder1_MGVSearchResult_lblsearchclass_'),
        ('Status', 'ContentPlaceHolder1_MGVSearchResult_Label6_'),
    ]
    words_list = []
    for i in range(counter):
        words_dict = {}
        for key, prefix in fields:
            row = soup.find('span', attrs={'id': prefix + str(i)})
            words_dict[key] = row.text
        words_list.append(words_dict)
    return words_list


def showDetails(wordmark, page_class):
    """Validate the inputs and return the search results as a JSON string.

    Returns None (after printing a message) when the inputs are invalid;
    the site requires a wordmark of at least 3 characters.
    """
    if len(wordmark) > 2 and page_class.isalnum():
        return json.dumps(scrapping(returnJson(wordmark, page_class)))
    print("Please Enter Valid Parameters\n")


showDetails('AIWA', '2')

Debe crear otra solicitud POST utilizando la información de la primera solicitud POST. A continuación se muestra cómo se puede extraer la Goods & Services Description de los datos devueltos:

 from operator import itemgetter from bs4 import BeautifulSoup import requests,json headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'} def get_input_value(soup, name): return soup.find("input", attrs={"name" : name})['value'] def returnJson(wordmark, page_class): url = "http://ipindiaonline.gov.in/tmrpublicsearch/frmmain.aspx" r_init = requests.get(url) soup = BeautifulSoup(r_init.text, 'html.parser') postdata = { "ctl00$ContentPlaceHolder1$DDLFilter" : "0", "ctl00$ContentPlaceHolder1$DDLSearchType" : "WM", "ctl00$ContentPlaceHolder1$TBWordmark" : wordmark, "ctl00$ContentPlaceHolder1$TBClass" : page_class, "__EVENTVALIDATION" : get_input_value(soup, "__EVENTVALIDATION"), "__EVENTTARGET" : "ctl00$ContentPlaceHolder1$BtnSearch", "__VIEWSTATE" : get_input_value(soup, "__VIEWSTATE"), } r = requests.post(url, headers=headers, data=postdata) return r def scrapping(r): soup = BeautifulSoup(r.content, 'html.parser') counter = len(soup.find_all('tr', attrs={'class':'row'})) counter += len(soup.find_all('tr', attrs={'class':'alt'})) words_list = [] fields = [ ("Wordmark", "ContentPlaceHolder1_MGVSearchResult_lblsimiliarmark_{}"), ("Proprietor", "ContentPlaceHolder1_MGVSearchResult_LblVProprietorName_{}"), ("Application Number", "ContentPlaceHolder1_MGVSearchResult_lblapplicationnumber_{}"), ("Class", "ContentPlaceHolder1_MGVSearchResult_lblsearchclass_{}"), ("Status", "ContentPlaceHolder1_MGVSearchResult_Label6_{}"), ] for index in range(0, counter): words_dict = {} for key, field in fields: words_dict[key] = soup.find('span', attrs={'id' : field.format(index)}).text print("Wordmark: {}".format(words_dict["Wordmark"])) # Construct a POST request for the Show Details panel # Locate matching 'Show details' link span = soup.find('span', attrs={'id' : fields[0][1].format(index)}) a = span.find_next('a', class_='LnkshowDetails') lnk_show_details = a['href'].split("'")[1] data = { 
"__EVENTTARGET" : lnk_show_details, "__VIEWSTATE" : get_input_value(soup, "__VIEWSTATE"), "__VIEWSTATEENCRYPTED" : "", "__EVENTVALIDATION" : get_input_value(soup, "__EVENTVALIDATION"), "__ASYNCPOST" : "true", } url = "http://ipindiaonline.gov.in/tmrpublicsearch" + soup.form["action"].strip(".") r_details = requests.post(url, headers=headers, data=data) html = b''.join(itemgetter(7, 8)(r_details.content.split(b"|"))) soup_details = BeautifulSoup(html, "html.parser") details = {} for tr in soup_details.find_all('tr'): row = [td.text for td in tr.find_all('td')] # Note: Journal No and Used since would need more work details[row[0]] = row[1] # Copy description desc = 'Goods & Services Description' words_dict[desc] = details[desc] words_list.append(words_dict) return words_list def showDetails(wordmark, page_class): if len(wordmark) > 2 and page_class.isalnum() == 1: var = json.dumps(scrapping(returnJson(wordmark, page_class))) return var else: print("Please Enter Valid Parameters\n") print(showDetails('AIWA','2')) 

Esto mostraría:

 Wordmark: AIWA Wordmark: AIWACEM Wordmark: AIWAPRIME (LABEL) [{"Wordmark": "AIWA", "Proprietor": "AIWA CO. LTD.", "Application Number": "683935", "Class": "2", "Status": "Registered", "Goods & Services Description": "PAINTS, VARNISHES, LACQUERS, PRESERVATIVES AGAINST RUST AND AGAINST DESTRIORATION OF WOOD, COLOURING MATTERS, DYESTUFFS, MORDANTS, NATURAL RESINS, METALS IN FOIL AND POWDER FROM FOR PAINTERS AND DECORATORS."}, {"Wordmark": "AIWACEM ", "Proprietor": "AMITA B. MEHTA", "Application Number": "1108415", "Class": "2", "Status": "Registered", "Goods & Services Description": "waterproof cement paint."}, {"Wordmark": "AIWAPRIME (LABEL)", "Proprietor": "AMITA B. MEHTA", "Application Number": "1165809", "Class": "2", "Status": "Registered", "Goods & Services Description": "WATER BASED CEMENT PRIMER INCLUDED IN CLASS 2."}] 

Nota: Los datos devueltos contienen otros campos que están separados por el carácter |. El HTML de los detalles también contiene este carácter, por lo que es necesario extraer los campos 7 y 8 para obtener solo el HTML.

No intentaría esto utilizando BeautifulSoup. Sin embargo, creo que podría hacerlo con Selenium ( consulte https://selenium-python.readthedocs.io/ .)

Usando sus instalaciones, puede hacer clic en el botón ‘Mostrar detalles’ de su elección, luego esperar a que aparezca la información solicitada en el panel derecho y luego recoger la información solicitada más o menos como lo haría usando BeautifulSoup desde ese panel.