Spider code
# -*- coding: utf-8 -*-
import json

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from fake_useragent import UserAgent

from lianjia.items import LianjiaItem


class RsfjySpider(CrawlSpider):
    name = 'rsfjy'
    allowed_domains = ['bj.lianjia.com']
    start_urls = ['https://bj.lianjia.com/chengjiao']
    ua = UserAgent()  # random User-Agent generator

    # Extract the entry link of each Beijing district from the start page
    rules = (
        Rule(LinkExtractor(restrict_xpaths=['//div[@data-role="ershoufang"]//a']),
             follow=True, callback='all_links'),
    )
    # Parse a detail page, fill the item, and hand it to the pipeline
    def parse_info(self, response):
        item = LianjiaItem()
        # The page header holds "community  layout  area" in one text node
        header = response.xpath('/html/body/div[4]/div/text()').get().split()
        c_title = header[0]   # community name
        h_type = header[1]    # floor plan
        p_square = header[2]  # floor area in m²
        c_time = response.xpath('/html/body/div[4]/div/span/text()').get()  # deal date
        c_price = response.xpath('/html/body/section[1]/div[2]/div[2]/div[1]/span/i/text()').get()  # deal price
        g_price = response.xpath('/html/body/section[1]/div[2]/div[2]/div[3]/span[1]/label/text()').get()  # listing price
        c_cycle = response.xpath('/html/body/section[1]/div[2]/div[2]/div[3]/span[2]/label/text()').get()  # days on market
        t_frequency = response.xpath('/html/body/section[1]/div[2]/div[2]/div[3]/span[3]/label/text()').get()  # price adjustments
        watch_num = response.xpath('/html/body/section[1]/div[2]/div[2]/div[3]/span[4]/label/text()').get()  # viewings
        focus_num = response.xpath('/html/body/section[1]/div[2]/div[2]/div[3]/span[5]/label/text()').get()  # followers
        l_browse = response.xpath('/html/body/section[1]/div[2]/div[2]/div[3]/span[6]/label/text()').get()  # page views
        item['c_title'] = c_title
        item['h_type'] = h_type
        item['p_square'] = p_square
        item['c_time'] = c_time
        item['c_price'] = c_price
        item['g_price'] = g_price
        item['c_cycle'] = c_cycle
        item['t_frequency'] = t_frequency
        item['watch_num'] = watch_num
        item['focus_num'] = focus_num
        item['l_browse'] = l_browse
        yield item
    # Parse a list page, then request the detail page of every house on it
    def parse_item(self, response):
        info_list = response.xpath('//ul[@class="listContent"]/li')
        for info in info_list:
            title = info.xpath("div[@class='info']/div[@class='title']/a/text()").get()
            link = info.xpath("div[@class='info']/div[@class='title']/a/@href").get()
            self.logger.debug('%s %s', title, link)
            yield scrapy.Request(url=response.urljoin(link), callback=self.parse_info,
                                 headers={'User-Agent': self.ua.random})
    # Pagination: build the URL of every transaction list page for a district
    def next_page(self, response):
        page_url = response.xpath('//@page-url').get()
        page_data = response.xpath('//@page-data').get()
        # page-data holds a JSON object with a "totalPage" field;
        # json.loads is safer than the original eval()
        total_page = json.loads(page_data)['totalPage']
        for page in range(1, total_page + 1):
            rel_url = page_url.format(page=page)
            yield scrapy.Request(url=response.urljoin(rel_url), callback=self.parse_item,
                                 headers={'User-Agent': self.ua.random})
    # Follow every district link and start paginating it
    def all_links(self, response):
        links = response.xpath('//div[@data-role="ershoufang"]//a/@href').getall()
        for url in links:
            yield scrapy.Request(url=response.urljoin(url), callback=self.next_page,
                                 headers={'User-Agent': self.ua.random})
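With the settings, item, and pipeline files below in place, the crawl is started from the project root with the standard Scrapy command (the spider name comes from the name attribute above):

scrapy crawl rsfjy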
Settings (settings.py)
from fake_useragent import UserAgent

BOT_NAME = 'lianjia'
SPIDER_MODULES = ['lianjia.spiders']
NEWSPIDER_MODULE = 'lianjia.spiders'

ua = UserAgent()
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'lianjia (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Enable logging
LOG_ENABLED = True
# Log file name and path
LOG_FILE = 'lianjia.log'
# Log encoding
LOG_ENCODING = 'utf-8'
# Only messages at WARNING level and above are written
LOG_LEVEL = 'WARNING'
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# Fixed base delay. The original line assigned the result of time.sleep(),
# which is None, so the delay was silently disabled. RANDOMIZE_DOWNLOAD_DELAY
# (on by default) already varies the actual wait between 0.5x and 1.5x of
# this value, which randomizes the request rhythm against anti-scraping checks.
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
# COOKIES_ENABLED = True
# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    # The correct header name is 'User-Agent' (not 'UserAgent'). Note that
    # settings.py is evaluated once at startup, so this value never changes
    # between requests; see the middleware sketch after this settings block.
    'User-Agent': ua.random,
}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
# 'lianjia.middlewares.LianjiaSpiderMiddleware': 543,
# # 'lianjia.middlewares.ProxyMiddleware':543
# }
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# 'lianjia.middlewares.LianjiaDownloaderMiddleware': 543,
# }
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
# }
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'lianjia.pipelines.LianjiaPipeline': 300,
}
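Because ua.random in DEFAULT_REQUEST_HEADERS is evaluated only once at startup, every request carries the same User-Agent unless it is overridden per request (as the spider above does). A cleaner alternative is a small downloader middleware; this is a minimal sketch, and the class name RandomUserAgentMiddleware is illustrative rather than part of the original project:

# middlewares.py -- hypothetical per-request User-Agent rotation
from fake_useragent import UserAgent

class RandomUserAgentMiddleware:
    def __init__(self):
        self.ua = UserAgent()

    def process_request(self, request, spider):
        # give every outgoing request a fresh User-Agent
        request.headers['User-Agent'] = self.ua.random

# register it by uncommenting DOWNLOADER_MIDDLEWARES above:
# DOWNLOADER_MIDDLEWARES = {'lianjia.middlewares.RandomUserAgentMiddleware': 543}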
Items (items.py)
import scrapy


class LianjiaItem(scrapy.Item):
    c_title = scrapy.Field()      # community name
    h_type = scrapy.Field()       # floor plan
    p_square = scrapy.Field()     # floor area in m²
    c_time = scrapy.Field()       # deal date
    c_price = scrapy.Field()      # deal price
    g_price = scrapy.Field()      # listing price
    c_cycle = scrapy.Field()      # days on market
    t_frequency = scrapy.Field()  # number of price adjustments
    focus_num = scrapy.Field()    # followers
    watch_num = scrapy.Field()    # viewings
    l_browse = scrapy.Field()     # page views
    # price = scrapy.Field()
    # average_price = scrapy.Field()
    # link = scrapy.Field()
Pipeline (pipelines.py)
from scrapy.exporters import JsonLinesItemExporter


class LianjiaPipeline:
    # Open the output file once when the pipeline is instantiated
    def __init__(self):
        self.file = open('lianjia.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.file, ensure_ascii=False, encoding='utf-8')

    # Write each item as one JSON line
    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    # Scrapy calls close_spider (not close_item) when the spider finishes,
    # so the original method was never invoked and the file stayed open
    def close_spider(self, spider):
        self.file.close()
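For reference, Scrapy 2.1+ can write the same JSON-lines file without a custom pipeline through its built-in feed exports; a minimal sketch of the equivalent settings.py entry:

# settings.py -- built-in feed export, roughly equivalent to the pipeline above
FEEDS = {
    'lianjia.json': {'format': 'jsonlines', 'encoding': 'utf-8'},
}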
Data preview
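JsonLinesItemExporter writes one JSON object per line, so the output can be previewed directly with pandas; a minimal sketch, assuming the default output file lianjia.json:

import pandas as pd

# one JSON object per line -> read with lines=True
df = pd.read_json('lianjia.json', lines=True)
print(df.head())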