Spider code
# -*- coding: utf-8 -*-
import json

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from fake_useragent import UserAgent

from lianjia.items import LianjiaItem


class RsfjySpider(CrawlSpider):
    name = 'rsfjy'
    allowed_domains = ['bj.lianjia.com']
    start_urls = ['https://bj.lianjia.com/chengjiao']
    ua = UserAgent()  # random User-Agent generator

    # Extract the entry link of each Beijing district from the start page
    rules = (
        Rule(LinkExtractor(restrict_xpaths=['//div[@data-role="ershoufang"]//a']),
             follow=True, callback='all_links'),
    )
    # Parse a detail page, fill the item, and hand it to the pipeline
    def parse_info(self, response):
        item = LianjiaItem()
        # The page header holds "community  layout  area" in one text node
        header = response.xpath('/html/body/div[4]/div/text()').get().split()
        c_title = header[0]   # community name
        h_type = header[1]    # floor plan
        p_square = header[2]  # floor area in m²
        c_time = response.xpath('/html/body/div[4]/div/span/text()').get()  # deal date
        c_price = response.xpath('/html/body/section[1]/div[2]/div[2]/div[1]/span/i/text()').get()  # deal price
        g_price = response.xpath('/html/body/section[1]/div[2]/div[2]/div[3]/span[1]/label/text()').get()  # listing price
        c_cycle = response.xpath('/html/body/section[1]/div[2]/div[2]/div[3]/span[2]/label/text()').get()  # days on market
        t_frequency = response.xpath('/html/body/section[1]/div[2]/div[2]/div[3]/span[3]/label/text()').get()  # price adjustments
        watch_num = response.xpath('/html/body/section[1]/div[2]/div[2]/div[3]/span[4]/label/text()').get()  # viewings
        focus_num = response.xpath('/html/body/section[1]/div[2]/div[2]/div[3]/span[5]/label/text()').get()  # followers
        l_browse = response.xpath('/html/body/section[1]/div[2]/div[2]/div[3]/span[6]/label/text()').get()  # page views
        item['c_title'] = c_title
        item['h_type'] = h_type
        item['p_square'] = p_square
        item['c_time'] = c_time
        item['c_price'] = c_price
        item['g_price'] = g_price
        item['c_cycle'] = c_cycle
        item['t_frequency'] = t_frequency
        item['watch_num'] = watch_num
        item['focus_num'] = focus_num
        item['l_browse'] = l_browse
        yield item
    # Parse a list page, then request the detail page of every house on it
    def parse_item(self, response):
        info_list = response.xpath('//ul[@class="listContent"]/li')
        for info in info_list:
            title = info.xpath("div[@class='info']/div[@class='title']/a/text()").get()
            link = info.xpath("div[@class='info']/div[@class='title']/a/@href").get()
            self.logger.debug('%s %s', title, link)
            yield scrapy.Request(url=response.urljoin(link), callback=self.parse_info,
                                 headers={'User-Agent': self.ua.random})
    # Pagination: build the URL of every transaction list page for a district
    def next_page(self, response):
        page_url = response.xpath('//@page-url').get()
        page_data = response.xpath('//@page-data').get()
        # page-data holds a JSON object with a "totalPage" field;
        # json.loads is safer than the original eval()
        total_page = json.loads(page_data)['totalPage']
        for page in range(1, total_page + 1):
            rel_url = page_url.format(page=page)
            yield scrapy.Request(url=response.urljoin(rel_url), callback=self.parse_item,
                                 headers={'User-Agent': self.ua.random})
    # Follow every district link and start paginating it
    def all_links(self, response):
        links = response.xpath('//div[@data-role="ershoufang"]//a/@href').getall()
        for url in links:
            yield scrapy.Request(url=response.urljoin(url), callback=self.next_page,
                                 headers={'User-Agent': self.ua.random})
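With the settings, item, and pipeline files below in place, the crawl is started from the project root with the standard Scrapy command (the spider name comes from the name attribute above):

scrapy crawl rsfjy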
Settings (settings.py)
from fake_useragent import UserAgent

BOT_NAME = 'lianjia'
SPIDER_MODULES = ['lianjia.spiders']
NEWSPIDER_MODULE = 'lianjia.spiders'

ua = UserAgent()
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'lianjia (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Enable logging
LOG_ENABLED = True
# Log file name and path
LOG_FILE = 'lianjia.log'
# Log encoding
LOG_ENCODING = 'utf-8'
# Only messages at WARNING level and above are written
LOG_LEVEL = 'WARNING'
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# Fixed base delay. The original line assigned the result of time.sleep(),
# which is None, so the delay was silently disabled. RANDOMIZE_DOWNLOAD_DELAY
# (on by default) already varies the actual wait between 0.5x and 1.5x of
# this value, which randomizes the request rhythm against anti-scraping checks.
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
# COOKIES_ENABLED = True
# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    # The correct header name is 'User-Agent' (not 'UserAgent'). Note that
    # settings.py is evaluated once at startup, so this value never changes
    # between requests; see the middleware sketch after this settings block.
    'User-Agent': ua.random,
}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
# 'lianjia.middlewares.LianjiaSpiderMiddleware': 543,
# # 'lianjia.middlewares.ProxyMiddleware':543
# }
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# 'lianjia.middlewares.LianjiaDownloaderMiddleware': 543,
# }
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
# }
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'lianjia.pipelines.LianjiaPipeline': 300,
}
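Because ua.random in DEFAULT_REQUEST_HEADERS is evaluated only once at startup, every request carries the same User-Agent unless it is overridden per request (as the spider above does). A cleaner alternative is a small downloader middleware; this is a minimal sketch, and the class name RandomUserAgentMiddleware is illustrative rather than part of the original project:

# middlewares.py -- hypothetical per-request User-Agent rotation
from fake_useragent import UserAgent

class RandomUserAgentMiddleware:
    def __init__(self):
        self.ua = UserAgent()

    def process_request(self, request, spider):
        # give every outgoing request a fresh User-Agent
        request.headers['User-Agent'] = self.ua.random

# register it by uncommenting DOWNLOADER_MIDDLEWARES above:
# DOWNLOADER_MIDDLEWARES = {'lianjia.middlewares.RandomUserAgentMiddleware': 543}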
Items (items.py)
import scrapy


class LianjiaItem(scrapy.Item):
    c_title = scrapy.Field()      # community name
    h_type = scrapy.Field()       # floor plan
    p_square = scrapy.Field()     # floor area in m²
    c_time = scrapy.Field()       # deal date
    c_price = scrapy.Field()      # deal price
    g_price = scrapy.Field()      # listing price
    c_cycle = scrapy.Field()      # days on market
    t_frequency = scrapy.Field()  # number of price adjustments
    focus_num = scrapy.Field()    # followers
    watch_num = scrapy.Field()    # viewings
    l_browse = scrapy.Field()     # page views
    # price = scrapy.Field()
    # average_price = scrapy.Field()
    # link = scrapy.Field()
Pipeline (pipelines.py)
from scrapy.exporters import JsonLinesItemExporter


class LianjiaPipeline:
    # Open the output file once when the pipeline is instantiated
    def __init__(self):
        self.file = open('lianjia.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.file, ensure_ascii=False, encoding='utf-8')

    # Write each item as one JSON line
    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    # Scrapy calls close_spider (not close_item) when the spider finishes,
    # so the original method was never invoked and the file stayed open
    def close_spider(self, spider):
        self.file.close()
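For reference, Scrapy 2.1+ can write the same JSON-lines file without a custom pipeline through its built-in feed exports; a minimal sketch of the equivalent settings.py entry:

# settings.py -- built-in feed export, roughly equivalent to the pipeline above
FEEDS = {
    'lianjia.json': {'format': 'jsonlines', 'encoding': 'utf-8'},
}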
Data preview
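JsonLinesItemExporter writes one JSON object per line, so the output can be previewed directly with pandas; a minimal sketch, assuming the default output file lianjia.json:

import pandas as pd

# one JSON object per line -> read with lines=True
df = pd.read_json('lianjia.json', lines=True)
print(df.head())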