一、数据爬取
利用python爬取链家网成都市二手房信息,具体使用scrapy框架写爬虫程序爬取信息。
1、spider文件(具体爬虫代码)
import scrapy
import time
import random
from LianJia.items import LianjiaItem
class LianJiaSpider(scrapy.Spider):
    """Spider that crawls second-hand housing ("ershoufang") listings on
    Lianjia for Chengdu.

    For every district slug in ``District_list`` it schedules the first two
    listing-index pages; responses are handled by ``self.parse_detail_url``
    (defined elsewhere in this file — presumably it extracts per-listing
    detail URLs and hands them to ``parse_detail_page``; confirm against the
    rest of the source).
    """

    name = 'LianJia'
    # allowed_domains = ['www.xxx.com']
    # Original single start URL, kept for reference:
    # start_urls = ['https://cd.lianjia.com/ershoufang/pg1/']

    # Lianjia URL path slugs for the Chengdu districts to crawl.
    # NOTE(review): these are the site's own slugs (e.g. 'pidou', 'gaoxin7'),
    # not standard pinyin — do not "correct" them or the URLs break.
    District_list = ['jinjiang', 'qingyang', 'wuhou', 'gaoxin7', 'chenghua',
                     'jinniu', 'tianfuxinqu', 'gaoxinxi1', 'shuangliu',
                     'wenjiang', 'pidou', 'longquanyi', 'xindou',
                     'tianfuxinqunanqu', 'qingbaijiang', 'doujiangyan',
                     'pengzhou', 'jianyang', 'xinjin', 'chongzhou1', 'dayi',
                     'jintang', 'pujiang', 'qionglai']

    # URL template: first placeholder is the district slug, second is the
    # 1-based page number of the listing index.
    url = 'https://cd.lianjia.com/ershoufang/{}/pg{}/'

    def start_requests(self):
        """Yield a request for pages 1-2 of each district's listing index.

        Returns:
            An iterator of ``scrapy.Request`` objects whose responses are
            processed by ``self.parse_detail_url``.
        """
        for district in self.District_list:
            # range(1, 3) deliberately limits the crawl to the first two
            # result pages per district.
            for page in range(1, 3):
                page_url = self.url.format(district, page)
                yield scrapy.Request(url=page_url,
                                     callback=self.parse_detail_url)
# 通过详情页url解析获取房子详细信息
def parse_detail_page(self, response):
info={
}
item = LianjiaItem()
try:
item['title'] = response.xpath('.//div[@class="title"]/h1/text()').extract_first()
item['total_price'] = response.xpath('/html/body/div[5]/div[2]/div[3]/span//text()').extract_first()
item['price'] = response.xpath('/html/body/div[5]/div[2]/div[3]/div[1]/div[1]/span//text()').extract_first()
item['build_time'] = response.xpath('/html/body/div[5]/div[2]/div[4]/div[3]/div[2]/text()').extract_first()
item['community_name'] = response.xpath('/html/body/div[5]/div[2]/div[5]/div[1]/a[1]/text()').extract_first()
item['district'] = response.xpath('/html/body/div[5]/div[2]/div[5]/div[2]/span[2]//text()').extract_first()
item['number'] = response.xpath(