Some of these text nodes have a string representation that is all whitespace, so strip those out, then look for the keywords “Opening Hours” and “Telephone” to process the lines in a loop:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
import re

from todo.items import wendyItem


class wendySpider(BaseSpider):
    name = "wendyspider"
    allowed_domains = ["wendys.com.sg"]
    start_urls = ["http://www.wendys.com.sg/outlets.php"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        cells = hxs.select('//div[@id="menu_list"]//td[@valign="top"][.//span[@class="foodTitle"]]')
        items = []
        for cell in cells:
            item = wendyItem()
            # get all text nodes; some are blank, so strip() them and drop the empties
            lines = cell.select('.//text()').extract()
            lines = [l.strip() for l in lines if l.strip()]
            # first non-blank line is the place name
            item['name'] = lines.pop(0)
            # for the remaining lines, check for "Opening Hours" and "Telephone"
            # to store each line in the correct list container
            address_lines = []
            hours_lines = []
            telephone_lines = []
            opening_hours = False
            telephone = False
            for line in lines:
                if 'Opening Hours' in line:
                    opening_hours = True
                elif 'Telephone' in line:
                    telephone = True
                if telephone:
                    telephone_lines.append(line)
                elif opening_hours:
                    hours_lines.append(line)
                else:
                    address_lines.append(line)
            # last address line is the postal code + town name
            item['address'] = "\n".join(address_lines[:-1])
            item['postal'] = address_lines[-1]
            # omit the "Opening Hours" header (first element in the list)
            item['hours'] = "\n".join(hours_lines[1:])
            item['contact'] = "\n".join(telephone_lines)
            items.append(item)
        return items
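To check the keyword-based bucketing in isolation, here is a minimal standalone sketch that runs the same scan over a made-up list of lines (the outlet data below is a hypothetical placeholder, not taken from the site):

# Minimal standalone sketch of the bucketing logic used in parse().
# The lines below are hypothetical placeholders, not real outlet data.
lines = [
    "Outlet Name",
    "1 Example Street",
    "#01-23 Example Mall",
    "Singapore 123456",
    "Opening Hours",
    "Daily: 10am - 10pm",
    "Telephone",
    "6123 4567",
]

name = lines.pop(0)  # first non-blank line is the place name
address_lines = []
hours_lines = []
telephone_lines = []
opening_hours = False
telephone = False
for line in lines:
    if 'Opening Hours' in line:
        opening_hours = True
    elif 'Telephone' in line:
        telephone = True
    if telephone:
        telephone_lines.append(line)
    elif opening_hours:
        hours_lines.append(line)
    else:
        address_lines.append(line)

print(name)                            # Outlet Name
print("\n".join(address_lines[:-1]))   # street / unit lines
print(address_lines[-1])               # Singapore 123456 (postal line)
print("\n".join(hours_lines[1:]))      # Daily: 10am - 10pm (header dropped)
print("\n".join(telephone_lines))      # includes the "Telephone" label, as in the spider

To run the full spider, the usual `scrapy crawl wendyspider` command applies, assuming the project's items module defines wendyItem with the name, address, postal, hours and contact fields.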