Scraping Beike (贝壳) Property Listings with Scrapy

Today I'm sharing code that uses the Scrapy framework to crawl Beike (贝壳) property listings, save them to MySQL, and then analyze the data with pyecharts to produce HTML charts, which are finally assembled into a complete data-analysis web page. If you spot any mistakes, please point them out so we can all learn and improve together. Without further ado, on to the code!
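
The files below assume the standard Scrapy project layout; if you want to recreate the project from scratch, running scrapy startproject beike and then scrapy genspider beikebeijing ke.com generates the skeleton that the following files fit into.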

I. Scrapy framework code:

1. items.py

import scrapy


class BeikeItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    biaoti = scrapy.Field()    # listing title
    didian = scrapy.Field()    # location / community name
    huxing = scrapy.Field()    # floor plan, e.g. "2室1厅"
    jiage = scrapy.Field()     # total price
    danjia = scrapy.Field()    # price per square metre
    biaoqian = scrapy.Field()  # listing tags
    mianji = scrapy.Field()    # floor area

2. middlewares.py

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class BeikeSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class BeikeDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

3. pipelines.py

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymysql


class BeikePipeline:
    def __init__(self):
        # charset is set explicitly so Chinese text is stored correctly
        self.connect = pymysql.connect(host="localhost", user="root", passwd="1234",
                                       db="beike", charset="utf8mb4")
        self.cursor = self.connect.cursor()
        print("数据库连接成功")

    def process_item(self, item, spider):
        print("开始保存数据")

        insql = "insert into beike_beijing(biaoti,didian,huxing,jiage,danjia,biaoqian,mianji) values (%s,%s,%s,%s,%s,%s,%s)"

        self.cursor.execute(insql, (
            item['biaoti'], item['didian'], item['huxing'], item['jiage'], item['danjia'],
            item['biaoqian'], item['mianji']))

        self.connect.commit()

        print("保存数据成功")

        return item

    def close_spider(self, spider):
        # called automatically by Scrapy when the spider closes
        self.cursor.close()
        self.connect.close()
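
The pipeline inserts into a beike_beijing table that must already exist in the beike database. The post doesn't show the schema, so here is a minimal sketch that matches the columns used above; the column types and lengths are my own assumptions and can be adjusted:

import pymysql

# Assumed schema for the target table (not from the original post).
create_sql = """
CREATE TABLE IF NOT EXISTS beike_beijing (
    id INT AUTO_INCREMENT PRIMARY KEY,
    biaoti VARCHAR(255),
    didian VARCHAR(255),
    huxing VARCHAR(255),
    jiage VARCHAR(64),
    danjia VARCHAR(64),
    biaoqian VARCHAR(255),
    mianji VARCHAR(255)
) DEFAULT CHARSET=utf8mb4;
"""

connect = pymysql.connect(host="localhost", user="root", passwd="1234", db="beike", charset="utf8mb4")
with connect.cursor() as cursor:
    cursor.execute(create_sql)
connect.commit()
connect.close()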

4. settings.py

# Scrapy settings for beike project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'beike'

SPIDER_MODULES = ['beike.spiders']
NEWSPIDER_MODULE = 'beike.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'beike (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = True

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers (replace the Cookie value below with one copied from your own logged-in browser session):
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    'Cookie': 'digv_extends=%7B%22utmTrackId%22%3A%2280418643%22%7D; lianjia_uuid=3d726c57-6d3f-4f6c-95a2-8b7abc9faeac; select_city=110000; lianjia_ssid=4473e3e6-43e7-4181-bb6a-ebb23ef4ec07; sajssdk_2015_cross_new_user=1; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221769ce5aab93e7-0c28ca07c23265-59442e11-1327104-1769ce5aaba695%22%2C%22%24device_id%22%3A%221769ce5aab93e7-0c28ca07c23265-59442e11-1327104-1769ce5aaba695%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E4%BB%98%E8%B4%B9%E5%B9%BF%E5%91%8A%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Fother.php%22%2C%22%24latest_referrer_host%22%3A%22www.baidu.com%22%2C%22%24latest_search_keyword%22%3A%22%E8%B4%9D%E5%A3%B3%22%2C%22%24latest_utm_source%22%3A%22baidu%22%2C%22%24latest_utm_medium%22%3A%22pinzhuan%22%2C%22%24latest_utm_campaign%22%3A%22wymoren%22%2C%22%24latest_utm_content%22%3A%22biaotimiaoshu%22%2C%22%24latest_utm_term%22%3A%22biaoti%22%7D%7D; crosSdkDT2019DeviceId=sgaxf7--ohhq7q-mq2s3hm3qk16atd-otg4hxhsr; _ga=GA1.2.999642397.1608950071; _gid=GA1.2.1417890696.1608950071; __xsptplusUT_788=1; __xsptplus788=788.1.1608950072.1608950072.1%234%7C%7C%7C%7C%7C%23%23duwbmR1LtYCy9OIqePHhHWS1htLXHyiz%23; Hm_lvt_9152f8221cb6243a53c83b956842be8a=1608950065,1608950073,1608950215; Hm_lpvt_9152f8221cb6243a53c83b956842be8a=1608950261; srcid=eyJ0Ijoie1wiZGF0YVwiOlwiM2M1NmJhYjliNzhmYzhhYzYzYWUyZGVjOWZmZWJjMjQwYzJhZmFlYmRjZTk4YWU2M2E3MDU4MjY3MDFlNDc5MThlNzkwMDI1NWM4NzNkYTA5YmQyZjBkZDFjZGIxZDg1YmJkMDlmODlmYzFkZGQxOTNiNGI3ZGU5MTU5ZmZlYWVlNWJlMjIzNTFkNzk2NDJkOTI4ZDYzYWEzNjkwYTVlNGU3MDRhMDcxYzQ5NDhmN2RiMzdjMGZiZGExZGY3NzdlZjYyMWZkOGMwMTAzMGNlZmUxNWZmYzAyMjlkODA0MTczZjE1MGRmOTFiYjZjZTgzNDEyY2JlOThjNDMwYzI1YjU2NGI2M2Q4ZTUxZjA5ZmM5MTgyMGVjZWY2OTA2ZDhkN2JiYWYxMzFkZDkxZjU3YjUxZWZhNTZjM2EyNzczMGI4ODgxNGFhNGViNjA5YjlhMjMxYmI0OWZiNzEyNzBhNFwiLFwia2V5X2lkXCI6XCIxXCIsXCJzaWduXCI6XCJjNDBjMDg1ZVwifSIsInIiOiJodHRwczovL2JqLmtlLmNvbS9lcnNob3VmYW5nL3BnMi8iLCJvcyI6IndlYiIsInYiOiIwLjEifQ==; login_ucid=2000000074667028; lianjia_token=2.0015e8780f68fe41f70445513e50d1f7b5; lianjia_token_secure=2.0015e8780f68fe41f70445513e50d1f7b5; security_ticket=WyjQtDuz1ImoP8myKzaHDGUewY7FuWIViEWxA+VfVYPS1kh3NigeIWicj7EQoTgPFJUTK6nPMHlbU+pvTlI4XRKfiyiRoeyEjIqFkcidofJneE75XwFlyXW1/eb85/AktQwvEFK2zqJHTb5owtGQiVxFGh2l/UFVDVJMjHsN4Ec='
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#    'beike.middlewares.BeikeSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#    'beike.middlewares.BeikeDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'beike.pipelines.BeikePipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True  # must be enabled for the AUTOTHROTTLE_* settings below to take effect
# The initial download delay
AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

5. beikebeijing.py

import scrapy
from beike.items import BeikeItem
import time


class BeikebeijingSpider(scrapy.Spider):
    n_id = 2  # next results page to request (pg1 is already in start_urls)
    name = 'beikebeijing'
    allowed_domains = ['ke.com']
    start_urls = ['https://bj.ke.com/ershoufang/pg1/']

    def parse(self, response):

        for bk in response.xpath("//div[@data-component='list']/ul/li"):

            # build a fresh item for every listing so earlier items are not overwritten
            item = BeikeItem()

            biaoti = bk.xpath(".//div[@class='info clear']/div[@class='title']/a/text()").extract()
            if len(biaoti) > 0:
                item['biaoti'] = biaoti[0]
            else:
                item['biaoti'] = ""

            didian = bk.xpath(".//div[@class='positionInfo']/a/text()").extract()
            if len(didian) > 0:
                item['didian'] = didian[0]
            else:
                item['didian'] = ""

            # houseInfo holds several "|"-separated fields; the first is assumed to be
            # the layout (huxing) and the second the area (mianji)
            houseinfo = bk.xpath(".//div[@class='houseInfo']/text()").extract()
            if len(houseinfo) > 0:
                parts = [p.strip() for p in houseinfo[0].split("|")]
                item['huxing'] = parts[0]
                item['mianji'] = parts[1] if len(parts) > 1 else ""
            else:
                item['huxing'] = ""
                item['mianji'] = ""

            jiage = bk.xpath(".//div[@class='totalPrice']/span/text()").extract()
            if len(jiage) > 0:
                item['jiage'] = jiage[0]
            else:
                item['jiage'] = ""

            danjia = bk.xpath(".//div[@class='unitPrice']/span/text()").extract()
            if len(danjia) > 0:
                dj = danjia[0].replace("单价", "")
                item['danjia'] = dj.replace("元/平米", "")
            else:
                item['danjia'] = ""

            biaoqian = bk.xpath(".//div[@class='tag']/span/text()").extract()
            if len(biaoqian) > 0:
                item['biaoqian'] = biaoqian[0]
            else:
                item['biaoqian'] = ""

            yield item

        # follow the listing pages up to pg99
        n_url = "https://bj.ke.com/ershoufang/pg{}/".format(self.n_id)

        if self.n_id < 100:
            # blocking pause between pages; DOWNLOAD_DELAY in settings.py is the
            # non-blocking way to achieve the same throttling
            time.sleep(5)

            yield scrapy.Request(url=n_url, dont_filter=True, callback=self.parse)

            self.n_id += 1
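
The spider is started with scrapy crawl beikebeijing from the project root. If you would rather launch it from a plain Python script, a minimal sketch (assuming it is run from the directory that contains scrapy.cfg, so the project settings can be found) looks like this:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from beike.spiders.beikebeijing import BeikebeijingSpider

if __name__ == '__main__':
    # load settings.py from the project and run the spider in-process
    process = CrawlerProcess(get_project_settings())
    process.crawl(BeikebeijingSpider)
    process.start()  # blocks until the crawl finishes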

II. Data analysis code:

1. Pie chart (价格饼图.html)

import pymysql
from pyecharts.charts import Pie
import pyecharts.options as opts


def select_huxing_jiage():
    # total price grouped by floor plan (huxing)
    conn = pymysql.connect(host='localhost', user='root', passwd='1234', db='beike')

    cur = conn.cursor()

    select_sql = "SELECT huxing,SUM(jiage) FROM beike_beijing GROUP BY huxing;"

    cur.execute(select_sql)

    result1 = cur.fetchall()

    cur.close()
    conn.close()

    huxing = []
    jiage = []

    for i in result1:
        huxing.append(i[0])
        jiage.append(int(i[1]))

    return huxing, jiage


def bingtu():  # 饼图
    huxing, jiage = select_huxing_jiage()
    c = (
        Pie()
            .add("", [list(z) for z in zip(huxing, jiage)],
                 center=["50%", "60%"])
            .set_global_opts(title_opts=opts.TitleOpts(title="户型价格销量比例图"))
            .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
    )
    c.render("价格饼图.html")


if __name__ == '__main__':
    bingtu()

2. Bar chart (价格柱状图.html)

import pymysql
from pyecharts.charts import Bar
import pyecharts.options as opts


def select_huxing_jiage():
    # total price grouped by floor plan (huxing)
    conn = pymysql.connect(host='localhost', user='root', passwd='1234', db='beike')

    cur = conn.cursor()

    select_sql = "SELECT huxing,SUM(jiage) FROM beike_beijing GROUP BY huxing;"

    cur.execute(select_sql)

    result1 = cur.fetchall()

    cur.close()
    conn.close()

    huxing = []
    jiage = []

    for i in result1:
        huxing.append(i[0])
        jiage.append(int(i[1]))

    return huxing, jiage


def zhuzhuangtu():  # 柱状图
    huxing, jiage = select_huxing_jiage()
    bar = Bar(init_opts=opts.InitOpts(width='1000px', height='600px'))
    bar.add_xaxis(huxing)
    bar.add_yaxis("销量", jiage)
    bar.set_global_opts(title_opts=opts.TitleOpts("各户型价格统计"))
    bar.set_series_opts(label_opts=opts.LabelOpts(position="top"))
    bar.render("价格柱状图.html")


if __name__ == '__main__':
    zhuzhuangtu()

3. Word cloud (贝壳词云.png)

import pymysql
import wordcloud


def ciyun_beike():
    conn = pymysql.connect(host='localhost', user='root', passwd='1234', db='beike')

    cur = conn.cursor()

    select_biaoti_sql = "SELECT biaoti FROM beike_beijing;"

    cur.execute(select_biaoti_sql)

    beike = cur.fetchall()

    cur.close()
    conn.close()

    beike_list = []

    for i in beike:
        beike_list.append(i[0])

    dd_str = " ".join(beike_list)

    # a Chinese font file is required, otherwise the characters render as boxes
    lj = wordcloud.WordCloud(font_path="词云字体.ttf", width=1000, height=1000)

    lj.generate(dd_str)

    lj.to_file("贝壳词云.png")


if __name__ == '__main__':
    ciyun_beike()
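
Because the titles are Chinese, joining them with spaces gives the word cloud fairly coarse tokens. An optional refinement (my own sketch, not part of the original script) is to segment the titles with jieba before generating the cloud:

import jieba
import wordcloud


def ciyun_beike_jieba(beike_list):
    # cut every title into words, then join them with spaces for WordCloud
    seg_str = " ".join(jieba.lcut(" ".join(beike_list)))

    wc = wordcloud.WordCloud(font_path="词云字体.ttf", width=1000, height=1000)
    wc.generate(seg_str)
    wc.to_file("贝壳词云_jieba.png")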

There are a few other analysis charts as well, but I won't paste them all here; it would be too much…
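
To build the single data-analysis page mentioned at the top, the individual charts can be combined with pyecharts' Page component. A rough sketch, assuming a recent pyecharts 1.x and that bingtu() and zhuzhuangtu() are changed to return their chart objects instead of calling render() themselves:

from pyecharts.charts import Page


def quanbu():
    # collect the individual charts on one HTML page
    page = Page(layout=Page.SimplePageLayout)
    page.add(bingtu(), zhuzhuangtu())  # assumes both functions return chart objects
    page.render("贝壳数据分析.html")


if __name__ == '__main__':
    quanbu()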

Results:

[Result screenshots of the rendered charts and word cloud]

If you want the complete source code, you can download it from this link: https://download.csdn.net/download/liuxueyingwxnl/14951927

For more examples, follow my WeChat official account: PyDream. You're welcome to come exchange ideas and learn together!

