Making a subsequent POST request in a session doesn't work – web scraping

Here is what I'm trying to do: go here, then hit "search". Grab the data, then hit "next", and keep hitting "next" until you run out of pages. Everything up to hitting "next" works. Here is my code. The format of r.content is radically different the two times I print it, indicating that something different is happening between the GET and POST requests, even though I want very similar behavior. Why might this be happening?

What seems strange to me is that even after the POST request, which appears to be returning the wrong stuff, I can still parse the URLs I need, just not the __EVENTVALIDATION input field.

The error message (at the bottom of the code) indicates that the content doesn't include the data I need for the subsequent request, but navigating to the page shows that it does have that data, and that the format is very similar to the first page's.

EDIT: I have it opening web pages based on the HTML it is parsing, and something is definitely not right. Running the code below will open those pages.

The GET gets me a site with data like this:

  <input type="hidden" name="__EVENTVALIDATION" id="__EVENTVALIDATION" value="TlIgNH 

While the POST produces a site with all of that data at the bottom of the page, in plain text, like this:

 |0|hiddenField|__EVENTTARGET||0|hiddenField|__EVENTARGUMENT||0|hiddenField|_ 
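(For context, that tail is the ASP.NET AJAX partial-postback "delta" format, where hidden fields come back as `|hiddenField|<name>|<value>|` records instead of HTML inputs, which is why a `#__EVENTVALIDATION` selector finds nothing. A minimal sketch, not part of my code, assuming that layout holds, the token could be regexed straight out of the delta text:

    import re

    def field_from_delta(delta_text, name):
        # Pull a hidden field's value out of an ASP.NET AJAX delta response,
        # assuming the |hiddenField|<name>|<value>| records shown above.
        m = re.search(r"\|hiddenField\|" + re.escape(name) + r"\|([^|]*)\|", delta_text)
        return m.group(1) if m else None

    # e.g. field_from_delta(r.text, "__EVENTVALIDATION")

)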

Bad r.content

Good r.content

    import requests
    from lxml import html
    from bs4 import BeautifulSoup

    page = requests.get('http://search.cpsa.ca/physiciansearch')
    print('got page!')

    d = {"ctl00$ctl13": "ctl00$ctl13|ctl00$MainContent$physicianSearchView$btnSearch",
         "ctl00$MainContent$physicianSearchView$txtLastName": "",
         'ctl00$MainContent$physicianSearchView$txtFirstName': "",
         'ctl00$MainContent$physicianSearchView$txtCity': "",
         "__VIEWSTATEENCRYPTED": "",
         'ctl00$MainContent$physicianSearchView$txtPostalCode': "",
         'ctl00$MainContent$physicianSearchView$rblPractice': "",
         'ctl00$MainContent$physicianSearchView$ddDiscipline': "",
         'ctl00$MainContent$physicianSearchView$rblGender': "",
         'ctl00$MainContent$physicianSearchView$txtPracticeInterests': "",
         'ctl00$MainContent$physicianSearchView$ddApprovals': "",
         'ctl00$MainContent$physicianSearchView$ddLanguage': "",
         "__EVENTTARGET": "ctl00$MainContent$physicianSearchView$btnSearch",
         "__EVENTARGUMENT": "",
         'ctl00$MainContent$physicianSearchView$hfPrefetchUrl': "http://service.cpsa.ca/OnlineService/OnlineService.svc/Services/GetAlbertaCities?name=",
         'ctl00$MainContent$physicianSearchView$hfRemoveUrl': "http://service.cpsa.ca/OnlineService/OnlineService.svc/Services/GetAlbertaCities?name=%QUERY",
         '__ASYNCPOST': 'true'}

    h = {"X-MicrosoftAjax": "Delta = true",
         "X-Requested-With": "XMLHttpRequest",
         "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36"}

    urls = []

    with requests.session() as s:
        r = s.get("http://search.cpsa.ca/PhysicianSearch", headers=h)
        soup = BeautifulSoup(r.content, "lxml")
        tree = html.fromstring(r.content)
        html.open_in_browser(tree)
        ev = soup.select("#__EVENTVALIDATION")[0]["value"]
        vs = soup.select("#__VIEWSTATE")[0]["value"]
        vsg = soup.select("#__VIEWSTATEGENERATOR")[0]["value"]
        d["__EVENTVALIDATION"] = ev
        d["__VIEWSTATEGENERATOR"] = vsg
        d["__VIEWSTATE"] = vs
        r = s.post('http://search.cpsa.ca/PhysicianSearch', data=d, headers=h)
        print('opening in browser')
        retrievedUrls = tree.xpath('//*[@id="MainContent_physicianSearchView_gvResults"]/tr/td[2]/a/@href')
        print(retrievedUrls)
        for url in retrievedUrls:
            urls.append(url)

        endSearch = False
        while endSearch == False:
            tree = html.fromstring(r.content)
            html.open_in_browser(tree)
            soup = BeautifulSoup(r.content, "lxml")
            print('soup2:')  ## BREAKS HERE
            ev = soup.select("#__EVENTVALIDATION")[0]["value"]  ## BREAKS HERE
            vs = soup.select("#__VIEWSTATE")[0]["value"]
            vsg = soup.select("#__VIEWSTATEGENERATOR")[0]["value"]
            d["ctl00$ctl13"] = "ctl00$MainContent$physicianSearchView$ResultsPanel|ctl00$MainContent$physicianSearchView$gvResults$ctl01$btnNextPage"
            d["__EVENTVALIDATION"] = ev
            d["__EVENTTARGET"] = ""
            d["__VIEWSTATEGENERATOR"] = vsg
            d["__VIEWSTATE"] = vs
            d["ctl00$MainContent$physicianSearchView$gvResults$ctl01$ddlPager"] = 1
            d["ctl00$MainContent$physicianSearchView$gvResults$ctl01$btnNextPage"] = "Next"
            r = requests.post('http://search.cpsa.ca/PhysicianSearch', data=d, headers=h)
            tree = html.fromstring(r.content)
            retrievedUrls = tree.xpath('//*[@id="MainContent_physicianSearchView_gvResults"]/tr/td[2]/a/@href')
            print(urls)
            print(retrievedUrls)
            endSearch = True

    ...
    Traceback (most recent call last):
      File "C:\Users\daniel.bak\workspace\Alberta Physician Scraper\main\main.py", line 63, in <module>
        ev = soup.select("#__EVENTVALIDATION")[0]["value"]
    IndexError: list index out of range

Well, this one almost drove me crazy, but it is finally working. You have to make a GET request to get a fresh __EVENTVALIDATION token for each POST:

    import requests
    from bs4 import BeautifulSoup

    h = {"X-MicrosoftAjax": "Delta = true",
         "X-Requested-With": "XMLHttpRequest",
         "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36"}

    d = {"ctl00$ctl13": "ctl00$MainContent$physicianSearchView$btnSearch",
         "__EVENTTARGET": "ctl00$MainContent$physicianSearchView$btnSearch",
         'ctl00$MainContent$physicianSearchView$hfPrefetchUrl': "http://service.cpsa.ca/OnlineService/OnlineService.svc/Services/GetAlbertaCities?name=",
         'ctl00$MainContent$physicianSearchView$hfRemoveUrl': "http://service.cpsa.ca/OnlineService/OnlineService.svc/Services/GetAlbertaCities?name=%QUERY",
         '__ASYNCPOST': 'true'}

    nxt_d = {"ctl00$ctl13": "ctl00$MainContent$physicianSearchView$ResultsPanel|ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager",
             "ctl00$MainContent$physicianSearchView$gvResults$ctl01$ddlPager": "2",
             "ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager": "1",
             "__ASYNCPOST": "true",
             "__EVENTTARGET": "ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager"}

    url = "http://search.cpsa.ca/PhysicianSearch"

    with requests.session() as s:
        # Initial GET for the first __EVENTVALIDATION/__VIEWSTATE pair.
        r = s.get(url, headers=h)
        soup = BeautifulSoup(r.content, "lxml")
        ev = soup.select("#__EVENTVALIDATION")[0]["value"]
        vs = soup.select("#__VIEWSTATE")[0]["value"]
        d["__EVENTVALIDATION"] = ev
        d["__VIEWSTATE"] = vs
        # POST the search itself.
        r = s.post(url, data=d, headers=h)
        # Fresh GET to pull new tokens before the "next page" POST.
        soup = BeautifulSoup(s.get("http://search.cpsa.ca/PhysicianSearch").content, "lxml")
        ev = soup.select("#__EVENTVALIDATION")[0]["value"]
        vs = soup.select("#__VIEWSTATE")[0]["value"]
        nxt_d["__EVENTVALIDATION"] = ev
        nxt_d["__VIEWSTATE"] = vs
        r = s.post(url, data=nxt_d, headers=h)

If you open the source from the last POST, you will see that you hit page 2. We need to add more logic to get through all the pages; I will add it in a bit.

The params:

 "ctl00$MainContent$physicianSearchView$gvResults$ctl01$ddlPager": "2", "ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager": "1" 

are the page you are going to and the page you are coming from, so after a GET, those should be all that needs to change.
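A minimal sketch of that update step (the field names are taken from `nxt_d` above; this is an illustration, not the final code):

    def advance_pager(nxt_d):
        # Bump the "going to" and "coming from" pager values by one;
        # re-post nxt_d after fetching fresh tokens with a GET.
        to_pg = "ctl00$MainContent$physicianSearchView$gvResults$ctl01$ddlPager"
        from_pg = "ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager"
        nxt_d[to_pg] = str(int(nxt_d[to_pg]) + 1)
        nxt_d[from_pg] = str(int(nxt_d[from_pg]) + 1)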

This will get all the pages, pulling most of the values programmatically. It could probably pull more, especially with the help of a regex, but it pulls most without hard-coding values:

    from lxml.html import fromstring
    import requests


    class Crawler(object):
        def __init__(self, ua, url):
            self.user_agent = ua
            self.post_header = {"X-MicrosoftAjax": "Delta = true",
                                "X-Requested-With": "XMLHttpRequest",
                                "user-agent": ua}
            self.post_data2 = {'__ASYNCPOST': 'true',
                               "ctl00$ctl13": "ctl00$MainContent$physicianSearchView$ResultsPanel|ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager"}
            self.url = url
            self.post_data1 = {'__ASYNCPOST': 'true'}

        def populate(self, xml):
            """Pulls form post data keys and values for initial post."""
            k1 = xml.xpath("//*[@id='hfPrefetchUrl']")[0]
            k2 = xml.xpath("//*[@id='hfRemoveUrl']")[0]
            self.post_data1[k1.get("name")] = k1.get("value")
            self.post_data1[k2.get("name")] = k2.get("value")
            self.post_data1["ctl00$ctl13"] = xml.xpath("//input[@value='Search']/@name")[0]
            self.post_data1["__EVENTTARGET"] = self.post_data1["ctl00$ctl13"]

        def populate2(self, xml):
            """Pulls form post data keys and values for all subsequent posts,
               setting initial page number values.
            """
            data = xml.xpath("//*[@id='MainContent_physicianSearchView_gvResults_ddlPager']/@name")
            self.pge = data[0]
            self.ev = data[1]
            self.post_data2["__EVENTTARGET"] = self.ev
            self.post_data2[self.ev] = "1"
            self.post_data2[self.pge] = "2"

        @staticmethod
        def put_validation(xml, d):
            """Need to request new __EVENTVALIDATION for each post."""
            ev = xml.xpath("//*[@id='__EVENTVALIDATION']/@value")[0]
            vs = xml.xpath("//*[@id='__VIEWSTATE']/@value")[0]
            d["__EVENTVALIDATION"] = ev
            d["__VIEWSTATE"] = vs

        def next_page(self):
            """Increments the page numbers by one per iteration."""
            e = self.post_data2[self.ev]
            v = self.post_data2[self.pge]
            self.post_data2[self.pge] = str(int(v) + 1)
            self.post_data2[self.ev] = str(int(e) + 1)

        def start(self):
            with requests.session() as s:
                # get initial page to pull __EVENTVALIDATION etc..
                req = s.get(self.url, headers={"user-agent": self.user_agent}).content
                # add __EVENTVALIDATION to post data.
                self.put_validation(fromstring(req), self.post_data1)
                xml = fromstring(req)
                # populate the rest of the post data.
                self.populate(xml)
                resp = fromstring(s.post(self.url, data=self.post_data1, headers=self.post_header).content)
                # yield first page results.
                yield resp
                # fill post data for next pages.
                self.populate2(resp)
                # when this list is non-empty (button disabled), we have hit the last page.
                nxt = resp.xpath("//*[@id='MainContent_physicianSearchView_gvResults_btnNextPage']/@disabled")
                while not nxt:
                    # update __EVENTVALIDATION token and __VIEWSTATE.
                    self.put_validation(fromstring(s.get(self.url).content), self.post_data2)
                    # post to get next page of results.
                    page = fromstring(s.post(self.url, data=self.post_data2, headers=self.post_header).content)
                    yield page
                    nxt = page.xpath("//*[@id='MainContent_physicianSearchView_gvResults_btnNextPage']/@disabled")
                    self.next_page()


    ua = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36"
    url = "http://search.cpsa.ca/PhysicianSearch"

    c = Crawler(ua, url)
    for tree in c.start():
        ...  # use tree
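For instance, "use tree" might be pulling the profile links out of each yielded page with the same XPath the question used (a sketch, not part of the original answer):

    c = Crawler(ua, url)
    for tree in c.start():
        # Extract the physician profile links from the results table.
        hrefs = tree.xpath('//*[@id="MainContent_physicianSearchView_gvResults"]/tr/td[2]/a/@href')
        print(hrefs)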