start_urls = ['https://bj.lianjia.com/ershoufang/pg{}/'.format(i) for i in range(1, 3)] — start_urls accepts a list, so the paginated listing URLs can be generated with a list comprehension. range(1, 3) only covers 2 pages as an example; crawl more pages if you need them. Also note that crawling too heavily can get your IP banned, so proxy IPs are pretty much unavoidable.
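Scrapy attaches a proxy to an individual request through request.meta['proxy'] (handled by the built-in HttpProxyMiddleware). A minimal sketch of what that could look like in the LianjiaSpider shown below; the proxy address is only a placeholder, the original code does not include one:
# Add this method to LianjiaSpider (it replaces start_urls):
def start_requests(self):
    # Same pagination as the list comprehension, but each request carries a proxy.
    for i in range(1, 3):
        url = 'https://bj.lianjia.com/ershoufang/pg{}/'.format(i)
        # Placeholder proxy; substitute your own (ideally rotating) proxy here.
        yield scrapy.Request(url, callback=self.parse,
                             meta={'proxy': 'http://127.0.0.1:8000'})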
Code download link: https://pan.baidu.com/s/1REdbtZBCirpZlGqU5FAD1A
Extraction code: ib83
The download does not bundle the required third-party packages; install them yourself (scrapy, pymongo, pymysql and fake_useragent are used below). (This is all of the code from the blog post.)
spiders.py
import scrapy


class LianjiaSpider(scrapy.Spider):
    name = 'lianjia'
    allowed_domains = ['lianjia.com']
    # Listing pages 1 and 2; widen the range to crawl more pages.
    start_urls = ['https://bj.lianjia.com/ershoufang/pg{}/'.format(i) for i in range(1, 3)]

    def parse(self, response):
        # Collect the links to the individual house detail pages.
        urls = response.xpath('//div[@class="info clear"]/div[@class="title"]/a/@href').extract()
        for url in urls:
            yield scrapy.Request(url, callback=self.parse_index)

    def parse_index(self, response):
        # Total price: concat() glues the number and its unit together (e.g. "650" + "万").
        jia_ge = response.xpath('concat(//span[@class="total"]/text(), //span[@class="unit"]/span/text())').extract_first()
        jun_jia = response.xpath('//span[@class="unitPriceValue"]/text()').extract_first() + '元/每平方'
        xiao_qu = response.xpath('//div[@class="communityName"]/a[1]/text()').extract_first()
        # string() flattens the nested spans of the area breadcrumb into one string.
        address = response.xpath('string(//div[@class="areaName"]/span[@class="info"])').extract_first()
        # Basic-attributes block: one <li> per attribute, picked by position.
        hu_xing = response.xpath('//div[@class="base"]//ul[1]/li[1]/text()').extract_first()
        lou_ceng = response.xpath('//div[@class="base"]//ul/li[2]/text()').extract_first()
        jianzhu_mianji = response.xpath('//div[@class="base"]//ul/li[3]/text()').extract_first()
        huxing_jiegou = response.xpath('//div[@class="base"]//ul/li[4]/text()').extract_first()
        taonei_mianji = response.xpath('//div[@class="base"]//ul/li[5]/text()').extract_first()
        jianzhu_leixing = response.xpath('//div[@class="base"]//ul/li[6]/text()').extract_first()
        fangwu_chaoxiang = response.xpath('//div[@class="base"]//ul/li[7]/text()').extract_first()
        jianzhu_jiegou = response.xpath('//div[@class="base"]//ul/li[8]/text()').extract_first()
        zhuangxiu_qingkuang = response.xpath('//div[@class="base"]//ul/li[9]/text()').extract_first()
        tihu_bili = response.xpath('//div[@class="base"]//ul/li[10]/text()').extract_first()
        gongnuan_fangshi = response.xpath('//div[@class="base"]//ul/li[11]/text()').extract_first()
        peibei_dianti = response.xpath('//div[@class="base"]//ul/li[12]/text()').extract_first()
        chanquan_nianxian = response.xpath('//div[@class="base"]//ul/li[13]/text()').extract_first()
        # Transaction-attributes block.
        guapai_shijian = response.xpath('//div[@class="content"]/ul/li[1]/span[2]/text()').extract_first()
        jiaoyi_quanshu = response.xpath('//div[@class="content"]/ul/li[2]/span[2]/text()').extract_first()
        shangci_jiaoyi = response.xpath('//div[@class="content"]/ul/li[3]/span[2]/text()').extract_first()
        fangwu_yongtu = response.xpath('//div[@class="content"]/ul/li[4]/span[2]/text()').extract_first()
        fangwu_nianxian = response.xpath('//div[@class="content"]/ul/li[5]/span[2]/text()').extract_first()
        chanquan_suoshu = response.xpath('//div[@class="content"]/ul/li[6]/span[2]/text()').extract_first()
        diya_xinxi = response.xpath('//div[@class="content"]/ul/li[7]/span[2]/text()').extract_first().strip()
        fangben_beijian = response.xpath('//div[@class="content"]/ul/li[8]/span[2]/text()').extract_first()
        yield {
            "价格": jia_ge,
            "均价": jun_jia,
            "小区": xiao_qu,
            "地址": address,
            "户型": hu_xing,
            "楼层": lou_ceng,
            "建筑面积": jianzhu_mianji,
            "户型结构": huxing_jiegou,
            "套内面积": taonei_mianji,
            "建筑类型": jianzhu_leixing,
            "房屋朝向": fangwu_chaoxiang,
            "建筑结构": jianzhu_jiegou,
            "装修情况": zhuangxiu_qingkuang,
            "梯户比例": tihu_bili,
            "供暖方式": gongnuan_fangshi,
            "配备电梯": peibei_dianti,
            "产权年限": chanquan_nianxian,
            "挂牌时间": guapai_shijian,
            "交易权属": jiaoyi_quanshu,
            "上次交易": shangci_jiaoyi,
            "房屋用途": fangwu_yongtu,
            "房屋年限": fangwu_nianxian,
            "产权所属": chanquan_suoshu,
            "抵押信息": diya_xinxi,
            "房本备件": fangben_beijian,
        }
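If the concat() and string() XPath functions look unfamiliar, a quick check in a Python shell shows what they return; the HTML below is just a made-up fragment imitating the detail-page markup, not taken from the site:
from scrapy import Selector

html = '<span class="total">650</span><span class="unit"><span>万</span></span>'
sel = Selector(text=html)
# concat() joins both text nodes into a single string, e.g. '650万'
print(sel.xpath('concat(//span[@class="total"]/text(), //span[@class="unit"]/span/text())').extract_first())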
pipelines.py ---》 store the items in MongoDB (pick one of the two pipelines)
import pymongo


class MongoPipeline(object):
    def open_spider(self, spider):
        # Connect to the local MongoDB instance with the default host/port.
        self.client = pymongo.MongoClient()

    def process_item(self, item, spider):
        # Write each item into the "room" database, "lianjia" collection.
        self.client.room.lianjia.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()
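A quick way to confirm that the items actually landed in the room.lianjia collection, e.g. from a small throwaway script:
import pymongo

client = pymongo.MongoClient()
print(client.room.lianjia.count_documents({}))  # number of stored listings
print(client.room.lianjia.find_one())           # one sample document
client.close()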
pipelines.py ---》 store the items in MySQL (the other option); mind the connection arguments: host, port, user, password, db, charset
from pymysql import connect


class MySQLPipeline(object):
    def open_spider(self, spider):
        # Adjust host/port/user/password/db/charset to match your MySQL setup.
        self.client = connect(host="localhost", port=3306, user='root',
                              password='123456', db='room', charset='utf8')
        self.cursor = self.client.cursor()

    def process_item(self, item, spider):
        # The spider yields Chinese keys, so the item is indexed with those same keys.
        args = [
            item["价格"],
            item["均价"],
            item["小区"],
            item["地址"],
            item["户型"],
            item["楼层"],
            item["建筑面积"],
            item["户型结构"],
            item["套内面积"],
            item["建筑类型"],
            item["房屋朝向"],
            item["建筑结构"],
            item["装修情况"],
            item["梯户比例"],
            item["供暖方式"],
            item["配备电梯"],
            item["产权年限"],
            item["挂牌时间"],
            item["交易权属"],
            item["上次交易"],
            item["房屋用途"],
            item["房屋年限"],
            item["产权所属"],
            item["抵押信息"],
            item["房本备件"],
        ]
        # 0 fills the auto-increment id column; the placeholders take the 25 fields above.
        sql = 'insert into lianjia VALUES (0,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        self.cursor.execute(sql, args)
        self.client.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.client.close()
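The insert statement assumes an existing lianjia table whose first column is an auto-increment id followed by 25 text columns, but the post never shows the table definition. One possible layout, created once with pymysql before crawling; the column names are my own pinyin guesses mirroring the yielded fields:
from pymysql import connect

# Assumed column names, one per field yielded by the spider.
columns = ['jia_ge', 'jun_jia', 'xiao_qu', 'address', 'hu_xing', 'lou_ceng',
           'jianzhu_mianji', 'huxing_jiegou', 'taonei_mianji', 'jianzhu_leixing',
           'fangwu_chaoxiang', 'jianzhu_jiegou', 'zhuangxiu_qingkuang', 'tihu_bili',
           'gongnuan_fangshi', 'peibei_dianti', 'chanquan_nianxian', 'guapai_shijian',
           'jiaoyi_quanshu', 'shangci_jiaoyi', 'fangwu_yongtu', 'fangwu_nianxian',
           'chanquan_suoshu', 'diya_xinxi', 'fangben_beijian']
sql = ('create table if not exists lianjia ('
       'id int primary key auto_increment, '
       + ', '.join('{} varchar(255)'.format(c) for c in columns) + ')')
client = connect(host="localhost", port=3306, user='root',
                 password='123456', db='room', charset='utf8')
with client.cursor() as cursor:
    cursor.execute(sql)
client.commit()
client.close()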
middlewares.py, where I build a random User-Agent request header as a basic anti-ban measure.
from fake_useragent import UserAgent


class UserAgentMiddleware(object):
    def __init__(self):
        # Build the User-Agent pool once instead of on every request.
        self.ua = UserAgent()

    def process_request(self, request, spider):
        # Give each outgoing request a random browser User-Agent.
        request.headers.setdefault(b'User-Agent', self.ua.random)
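The middleware only takes effect once it is registered in DOWNLOADER_MIDDLEWARES (see settings.py below), and fake_useragent is a third-party package that has to be installed separately (pip install fake-useragent).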
settings.py, the relevant settings:
# -*- coding: utf-8 -*-
BOT_NAME = 'room'
SPIDER_MODULES = ['room.spiders']
NEWSPIDER_MODULE = 'room.spiders'
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 3
DOWNLOADER_MIDDLEWARES = {
    'room.middlewares.UserAgentMiddleware': 120,
}
ITEM_PIPELINES = {
    'room.pipelines.MongoPipeline': 300,
    # 'room.pipelines.MySQLPipeline': 300,
}
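With everything in place, start the crawl from the project root with scrapy crawl lianjia. If you just want a quick look at the scraped fields without a database, scrapy crawl lianjia -o lianjia.json -s FEED_EXPORT_ENCODING=utf-8 dumps the items to a JSON file with the Chinese keys kept readable.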