spider如下:
# -*- coding: utf-8 -*-
import scrapy
class LianjiaSpider(scrapy.Spider):
# Identifier under which this spider is registered.
name = 'lianjia'
# Requests are restricted to this domain (and its subdomains).
allowed_domains = ['lianjia.com']
# Paginated index URLs pg1 .. pg100 under bj.lianjia.com/ershoufang/.
start_urls = [
    'https://bj.lianjia.com/ershoufang/pg%d/' % page
    for page in range(1, 101)
]
def parse(self, response):
    """Schedule one follow-up request per listing link found on an index page.

    Args:
        response: The downloaded index page to scan for listing links.

    Yields:
        scrapy.Request: A GET request for each extracted link, with
        ``parse_info`` registered as its callback.
    """
    # Collect the href of every title anchor inside an "info clear" container.
    hrefs = response.xpath(
        '//div[@class="info clear"]/div[@class="title"]/a/@href'
    ).extract()
    for href in hrefs:
        # These are plain GET fetches, so a regular Request is used rather
        # than FormRequest (which exists for form submissions). urljoin
        # resolves the link against the page URL so relative hrefs also work;
        # absolute hrefs pass through unchanged.
        yield scrapy.Request(response.urljoin(href), callback=self.parse_info)
def parse_info(self,response):
#print(response.text)
price=response.xpath('//span[@class="total"]/text()').extract_first()
unitPrice=response.xpath('//span[@class="unitPriceValue"]/text()').extract_first()
hu_xing=response.xpath('//div[@class="houseInfo"]/div[@class="room"]/