If you look at the data posted to the form in Chrome's developer tools or Firebug, you can see that many fields are passed in the POST request. A few of them are essential and must be parsed from the original page. Parsing the ids from the div.sangria ul li a
tags is not sufficient, because the data actually posted is slightly different: what is posted is inside the JavaScript call to WebForm_DoPostBackWithOptions,
which is in the href attribute, not the id attribute:
href='javascript:WebForm_DoPostBackWithOptions(new
WebForm_PostBackOptions("ctl00$m$g_36ea0310_893d_4a19_9ed1_88a133d06423$ctl00$Repeater1$ctl03$lnk_Grupo", "", true, "", "", false, true))'>
Sometimes all the underscores in the id are simply replaced with dollar signs, so it is easy to do a str.replace to get the correct value, but that is not really the case here. We could use a regex to parse the href, but I like the js2xml lib, which can parse a JavaScript function call and its args into an XML tree.
The following code, using requests, shows how you can get the data from the initial request and get to all the pages you want:
import requests
from lxml import html
import js2xml
# Page URL that the postback form data is POSTed to (the same page the
# initial GET request fetches).
post = "http://www.asambleamadrid.es/ES/QueEsLaAsamblea/ComposiciondelaAsamblea/LosDiputados/Paginas/RelacionAlfabeticaDiputados.aspx"
def validate(xml):
    """Collect the hidden form fields that must accompany every POST.

    These are the minimum required fields; their values cannot be hardcoded,
    so they are read from the parsed page.

    :param xml: parsed page exposing an ``xpath`` method (e.g. an lxml tree).
    :return: dict mapping each field name to the first ``value`` attribute
        found on the element whose ``id`` equals that name.
    :raises IndexError: if any of the fields is missing from the page.
    """
    required = (
        "__VIEWSTATEGENERATOR",
        "__EVENTVALIDATION",
        "__VIEWSTATE",
        "__REQUESTDIGEST",
    )
    # Keys are sent as the POST field names, so each one must match the
    # element id exactly -- no stray whitespace such as " __REQUESTDIGEST".
    return {name: xml.xpath("//*[@id='%s']/@value" % name)[0] for name in required}
with requests.Session() as session:
    # Initial GET: fetch the page once to obtain both the postback hrefs
    # and the hidden form fields.
    first_response = session.get(
        "http://www.asambleamadrid.es/ES/QueEsLaAsamblea/ComposiciondelaAsamblea/LosDiputados/Paginas/RelacionAlfabeticaDiputados.aspx")
    first_page = html.fromstring(first_response.content)
    link_hrefs = first_page.xpath("//*[@id='moduloBusqueda']//div[@class='sangria']/ul/li/a/@href")
    payload = validate(first_page)
    for href in link_hrefs:
        # Parse the javascript: href into an XML tree and take the first
        # string argument of WebForm_PostBackOptions that starts with "ctl".
        target_nodes = js2xml.parse(href).xpath(
            "//identifier[@name='WebForm_PostBackOptions']/following-sibling::arguments/string[starts-with(.,'ctl')]")
        payload["__EVENTTARGET"] = target_nodes[0].text
        # Post back with the same hidden fields and the new event target,
        # then print the anchor texts of the returned listing.
        result_page = html.fromstring(session.post(post, data=payload).content)
        print(result_page.xpath("//ul[@class='listadoVert02']/ul/li/a/text()"))
If we run the code above, we see the different text output from all the anchor tags:
In [2]: with requests.Session() as s:
...: r = s.get(
...: "http://www.asambleamadrid.es/ES/QueEsLaAsamblea/ComposiciondelaAsamblea/LosDiputados/Paginas/RelacionAlfabeticaDiputados.aspx")
...: xml = html.fromstring(r.content)
...: hrefs = xml.xpath("//*[@id='moduloBusqueda']//div[@class='sangria']/ul/li/a/@href")
...: form_data = validate(xml)
...: for h in hrefs:
...: js_xml = js2xml.parse(h)
...: _id = js_xml.xpath(
...: "//identifier[@name='WebForm_PostBackOptions']/following-sibling::arguments/string[starts-with(.,'ctl')]")[
...: 0]
...: form_data["__EVENTTARGET"] = _id.text
...: r = s.post(post, data=form_data)
...: xml = html.fromstring(r.content)
...: print(xml.xpath("//ul[@class='listadoVert02']/ul/li/a/text()"))
...:
[u'Abo\xedn Abo\xedn, Sonsoles Trinidad', u'Adrados Gautier, M\xaa Paloma', u'Aguado Del Olmo, M\xaa Josefa', u'\xc1lvarez Padilla, M\xaa Nadia', u'Arribas Del Barrio, Jos\xe9 M\xaa', u'Ballar\xedn Valc\xe1rcel, \xc1lvaro C\xe9sar', u'Berrio Fern\xe1ndez-Caballero, M\xaa In\xe9s', u'Berzal Andrade, Jos\xe9 Manuel', u'Cam\xedns Mart\xednez, Ana', u'Carballedo Berlanga, M\xaa Eugenia', 'Cifuentes Cuencas, Cristina', u'D\xedaz Ayuso, Isabel Natividad', u'Escudero D\xedaz-Tejeiro, Marta', u'Fermosel D\xedaz, Jes\xfas', u'Fern\xe1ndez-Quejo Del Pozo, Jos\xe9 Luis', u'Garc\xeda De Vinuesa Gardoqui, Ignacio', u'Garc\xeda Mart\xedn, Mar\xeda Bego\xf1a', u'Garrido Garc\xeda, \xc1ngel', u'G\xf3mez Ruiz, Jes\xfas', u'G\xf3mez-Angulo Rodr\xedguez, Juan Antonio', u'Gonz\xe1lez Gonz\xe1lez, Isabel Gema', u'Gonz\xe1lez Jim\xe9nez, Bartolom\xe9', u'Gonz\xe1lez Taboada, Jaime', u'Gonz\xe1lez-Mo\xf1ux V\xe1zquez, Elena', u'Gonzalo L\xf3pez, Rosal\xeda', 'Izquierdo Torres, Carlos', u'Li\xe9bana Montijano, Pilar', u'Mari\xf1o Ortega, Ana Isabel', u'Moraga Valiente, \xc1lvaro', u'Mu\xf1oz Abrines, Pedro', u'N\xfa\xf1ez Guijarro, Jos\xe9 Enrique', u'Olmo Fl\xf3rez, Luis Del', u'Ongil Cores, M\xaa Gador', 'Ortiz Espejo, Daniel', u'Ossorio Crespo, Enrique Mat\xedas', 'Peral Guerra, Luis', u'P\xe9rez Baos, Ana Isabel', u'P\xe9rez Garc\xeda, David', u'Pla\xf1iol De Lacalle, Regina M\xaa', u'Redondo Alcaide, M\xaa Isabel', u'Roll\xe1n Ojeda, Pedro', u'S\xe1nchez Fern\xe1ndez, Alejandro', 'Sanjuanbenito Bonal, Diego', u'Serrano Guio, Jos\xe9 Tom\xe1s', u'Serrano S\xe1nchez-Capuchino, Alfonso Carlos', 'Soler-Espiauba Gallo, Juan', 'Toledo Moreno, Lucila', 'Van-Halen Acedo, Juan']
[u'Andaluz Andaluz, M\xaa Isabel', u'Ardid Jim\xe9nez, M\xaa Isabel', u'Carazo G\xf3mez, M\xf3nica', u'Casares D\xedaz, M\xaa Luc\xeda Inmaculada', u'Cepeda Garc\xeda De Le\xf3n, Jos\xe9 Carmelo', 'Cruz Torrijos, Diego', u'Delgado G\xf3mez, Carla', u'Franco Pardo, Jos\xe9 Manuel', u'Freire Campo, Jos\xe9 Manuel', u'Gabilondo Pujol, \xc1ngel', 'Gallizo Llamas, Mercedes', u"Garc\xeda D'Atri, Ana", u'Garc\xeda-Rojo Garrido, Pedro Pablo', u'G\xf3mez Montoya, Rafael', u'G\xf3mez-Chamorro Torres, Jos\xe9 \xc1ngel', u'Gonz\xe1lez Gonz\xe1lez, M\xf3nica Silvana', u'Leal Fern\xe1ndez, M\xaa Isaura', u'Llop Cuenca, M\xaa Pilar', 'Lobato Gandarias, Juan', u'L\xf3pez Ruiz, M\xaa Carmen', u'Manguan Valderrama, Eva M\xaa', u'Maroto Illera, M\xaa Reyes', u'Mart\xednez Ten, Carmen', u'Mena Romero, M\xaa Carmen', u'Moreno Navarro, Juan Jos\xe9', u'Moya Nieto, Encarnaci\xf3n', 'Navarro Lanchas, Josefa', 'Nolla Estrada, Modesto', 'Pardo Ortiz, Josefa Dolores', u'Quintana Viar, Jos\xe9', u'Rico Garc\xeda-Hierro, Enrique', u'Rodr\xedguez Garc\xeda, Nicol\xe1s', u'S\xe1nchez Acera, Pilar', u'Sant\xedn Fern\xe1ndez, Pedro', 'Segovia Noriega, Juan', 'Vicente Viondi, Daniel', u'Vinagre Alc\xe1zar, Agust\xedn']
['Abasolo Pozas, Olga', 'Ardanuy Pizarro, Miguel', u'Beirak Ulanosky, Jazm\xedn', u'Camargo Fern\xe1ndez, Ra\xfal', 'Candela Pokorna, Marco', 'Delgado Orgaz, Emilio', u'D\xedaz Rom\xe1n, Laura', u'Espinar Merino, Ram\xf3n', u'Espinosa De La Llave, Mar\xeda', u'Fern\xe1ndez Rubi\xf1o, Eduardo', u'Garc\xeda G\xf3mez, M\xf3nica', 'Gimeno Reinoso, Beatriz', u'Guti\xe9rrez Benito, Eduardo', 'Huerta Bravo, Raquel', u'L\xf3pez Hern\xe1ndez, Isidro', u'L\xf3pez Rodrigo, Jos\xe9 Manuel', u'Mart\xednez Abarca, Hugo', u'Morano Gonz\xe1lez, Jacinto', u'Ongil L\xf3pez, Miguel', 'Padilla Estrada, Pablo', u'Ruiz-Huerta Garc\xeda De Viedma, Lorena', 'Salazar-Alonso Revuelta, Cecilia', u'San Jos\xe9 P\xe9rez, Carmen', u'S\xe1nchez P\xe9rez, Alejandro', u'Serra S\xe1nchez, Isabel', u'Serra S\xe1nchez, Clara', 'Sevillano De Las Heras, Elena']
[u'Aguado Crespo, Ignacio Jes\xfas', u'\xc1lvarez Cabo, Daniel', u'Gonz\xe1lez Pastor, Dolores', u'Iglesia Vicente, M\xaa Teresa De La', 'Lara Casanova, Francisco', u'Marb\xe1n De Frutos, Marta', u'Marcos Arias, Tom\xe1s', u'Meg\xedas Morales, Jes\xfas Ricardo', u'N\xfa\xf1ez S\xe1nchez, Roberto', 'Reyero Zubiri, Alberto', u'Rodr\xedguez Dur\xe1n, Ana', u'Rubio Ruiz, Juan Ram\xf3n', u'Ruiz Fern\xe1ndez, Esther', u'Sol\xeds P\xe9rez, Susana', 'Trinidad Martos, Juan', 'Veloso Lozano, Enrique', u'Zafra Hern\xe1ndez, C\xe9sar']
You can add the exact same logic to your spider; I just used requests to show you a working example. You should also be aware that not every ASP.NET site behaves the same: you may have to re-validate for every post, as in this related answer.