# 1. Spider file (spider文件)
import scrapy
import time
import random
from LianJia.items import LianjiaItem
class LianJiaSpider(scrapy.Spider):
    """Crawl Lianjia (链家) second-hand housing listings for Chengdu, district by district."""

    # Spider name used on the command line: `scrapy crawl LianJia`.
    name = 'LianJia'
    # allowed_domains = ['www.xxx.com']
    # Starting url
    # start_urls = ['https://cd.lianjia.com/ershoufang/pg1/']
    # Pinyin slugs of Chengdu districts, substituted into the URL template below.
    District_list=['jinjiang', 'qingyang', 'wuhou', 'gaoxin7', 'chenghua', 'jinniu', 'tianfuxinqu', 'gaoxinxi1', 'shuangliu'
    , 'wenjiang', 'pidou', 'longquanyi', 'xindou', 'tianfuxinqunanqu', 'qingbaijiang', 'doujiangyan', 'pengzhou',
    'jianyang', 'xinjin', 'chongzhou1', 'dayi', 'jintang','pujiang','qionglai']
    # URL template: first placeholder is the district slug, second is the page number.
    url='https://cd.lianjia.com/ershoufang/{}/pg{}/'
def start_requests(self):
for district in self.District_list:
for i in range(1,3):
url=self.url.format(district,i)
yield scrapy.Request(url=url,callback=self.parse_detail_url)
# Parse the detail-page response to extract detailed information about a house.
def parse_detail_page(self, response):
info={
}
item = LianjiaItem()
try:
item['title'] = response.xpath('.//div[@class="title"]/h1/text()').extract_first()
item['total_price'] =