爬取链家租房数据，数据处理，进行可视化分析

最新推荐文章于 2024-06-05 21:52:54 发布

Feng_MuJin

最新推荐文章于 2024-06-05 21:52:54 发布

阅读量8.1k

点赞数 9

分类专栏： 爬虫 数据分析 数据可视化 文章标签： python

本文链接：https://blog.csdn.net/qq_42029527/article/details/84061033

版权

爬虫同时被 3 个专栏收录

17 篇文章 0 订阅

订阅专栏

数据可视化

10 篇文章 0 订阅

订阅专栏

数据分析

2 篇文章 0 订阅

订阅专栏

lianjiaspider.py

import asyncio
import aiohttp
import pandas as pd
from lxml import etree


class LianjiaSpider(object):
    """Crawl rental listing pages from sjz.lianjia.com and save them to ``lianjia.csv``.

    Listing pages 1-76 are fetched one after another, every ``info-panel``
    block on a page is turned into one row, and the collected rows are written
    to ``lianjia.csv`` when the crawl ends.
    """

    # Total time budget, in seconds, for a single page request.
    _TIMEOUT_SECONDS = 3

    def __init__(self):
        # Request headers sent with every page fetch.
        self._headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36"}
        # One dict per listing, appended by parse_page().
        self._data = list()

    @staticmethod
    def _first(values, index=0, default=""):
        """Return ``values[index]``, or *default* when the list is too short."""
        return values[index] if len(values) > index else default

    async def _fetch(self, session, url):
        """Fetch *url* through *session*; return the body text or ``None``."""
        try:
            timeout = aiohttp.ClientTimeout(total=self._TIMEOUT_SECONDS)
            async with session.get(url, headers=self._headers, timeout=timeout) as resp:
                if resp.status == 200:
                    return await resp.text()
        except (aiohttp.ClientError, asyncio.TimeoutError, UnicodeDecodeError) as e:
            # Best effort: report the failure and let the caller skip the page.
            print(e.args)
        return None

    async def get(self, url, session=None):
        """Download *url* and return its text.

        Args:
            url: Address of the page to download.
            session: Optional ``aiohttp.ClientSession`` to reuse.  When omitted
                a temporary session is opened for this single request.

        Returns:
            The response body as text, or ``None`` when the status is not 200
            or the request failed.
        """
        if session is not None:
            return await self._fetch(session, url)
        async with aiohttp.ClientSession() as own_session:
            return await self._fetch(own_session, url)

    async def parse_html(self):
        """Crawl listing pages 1-76 and write the collected rows to ``lianjia.csv``.

        Pages that cannot be downloaded or parsed are skipped.  The CSV is
        written once, in a ``finally`` block, so rows gathered before an
        unexpected error are still saved.
        """
        try:
            # One session for the whole crawl lets connections be reused.
            async with aiohttp.ClientSession() as session:
                for page in range(1, 77):
                    url = "https://sjz.lianjia.com/zufang/pg{}/".format(page)
                    print("正在爬取{}".format(url))
                    text = await self.get(url, session=session)
                    if not text:
                        # get() returns None on failure; there is nothing to parse.
                        print("跳过{}：未获取到页面内容".format(url))
                        continue
                    tree = etree.HTML(text)
                    if tree is None:
                        # Guard against a parse result without a root element.
                        continue
                    self.parse_page(tree)
        finally:
            print("正在存储数据....")
            data = pd.DataFrame(self._data)
            data.to_csv("lianjia.csv", encoding='utf_8_sig')

    def parse_page(self, html):
        """Extract every listing on one page and append it to ``self._data``.

        Args:
            html: Parsed page exposing ``xpath()``; each
                ``//div[@class='info-panel']`` node is treated as one listing.

        Fields that the original code stored as XPath result lists (``region``,
        ``zone``, ``meters``, ``where``, ``has``) are still stored as lists.
        Single-valued fields fall back to an empty string when the node is
        missing, so one incomplete listing no longer aborts the crawl.
        """
        for info in html.xpath("//div[@class='info-panel']"):
            con = info.xpath(".//div[@class='con']/text()")
            one_data = {
                "region": info.xpath(".//span[@class='region']/text()"),
                "zone": info.xpath(".//span[@class='zone']/span/text()"),
                "meters": info.xpath(".//span[@class='meters']/text()"),
                "where": info.xpath(".//div[@class='where']/span[4]/text()"),
                "louceng": self._first(con, 0),  # floor
                "type": self._first(con, 1),  # building style
                "xiaoshou": self._first(info.xpath(".//div[@class='con']/a/text()")),
                "has": info.xpath(".//div[@class='left agency']//text()"),
                "price": self._first(info.xpath(".//div[@class='price']/span/text()")),
                "price_pre": self._first(info.xpath(".//div[@class='price-pre']/text()")),
                "num": self._first(
                    info.xpath(".//div[@class='square']//span[@class='num']/text()")),
            }
            self._data.append(one_data)

    def run(self):
        """Run the crawl to completion on a dedicated event loop.

        Exceptions raised by the crawl propagate to the caller instead of
        being left unretrieved on a task.
        """
        loop = asyncio.new_event_loop()
        try:
            loop.run_until_complete(self.parse_html())
        finally:
            loop.close()


# Script entry point: build a spider and start the crawl.
if __name__ == '__main__':
    spider = LianjiaSpider()
    spider.run()

lianjia.csv

  ,has,louceng,meters,num,price,price_pre,region,type,where,xiaoshou,zone
0,['集中供暖'],高楼层(共33层),['127.86平米\xa0\xa0'],0,2300,2018.11.08 更新,['凤凰城梧桐苑\xa0\xa0'],板楼,['南'],南焦租房,['3室2厅\xa0\xa0']
1,['集中供暖'],中楼层(共6层),['55平米\xa0\xa0'],0,1200,2018.11.04 更新,['华兴小区\xa0\xa0'],板楼,['南'],世纪公园租房,['1室1厅\xa0\xa0']
2,['集中供暖'],中楼层(共6层),['138平米\xa0\xa0'],0,2400,2018.11.04 更新,['河冶小区\xa0\xa0'],板楼,['南 北'],跃进租房,['3室2厅\xa0\xa0']
3,['集中供暖'],低楼层(共6层),['90平米\xa0\xa0'],1,1500,2018.11.06 更新,['瑞国花园\xa0\xa0'],板楼,['南'],跃进租房,['2室2厅\xa0\xa0']
4,['集中供暖'],低楼层(共14层),['180平米\xa0\xa0'],0,3500,2018.11.13 更新,['华脉新村\xa0\xa0'],板楼,['南 北'],四十中学租房,['4室2厅\xa0\xa0']
5,"['近地铁', '随时看房', '集中供暖']",中楼层(共40层),['57平米\xa0\xa0'],0,3000,2018.11.09 更新,['华润大厦\xa0\xa0'],塔楼,['西'],南长租房,['1室1厅\xa0\xa0']
6,"['近地铁', '随时看房', '集中供暖']",中楼层(共40层),['42.56平米\xa0\xa0'],0,2200,2018.11.09 更新,['华润大厦\xa0\xa0'],塔楼,['南'],南长租房,['1室1厅\xa0\xa0']
7,['集中供暖'],中楼层(共34层),['148平米\xa0\xa0'],0,2500,2018.11.08 更新,['北城国际B区\xa0\xa0'],板楼,['南 北'],沿东租房,['3室2厅\xa0\xa0']
8,"['近地铁', '随时看房', '集中供暖']",中楼层(共40层),['40.09平米\xa0\xa0'],0,2100,2018.11.09 更新,['华润大厦\xa0\xa0'],塔楼,['南'],南长租房,['1室1厅\xa0\xa0']
9,"['近地铁', '集中供暖']",低楼层(共33层),['185平米\xa0\xa0'],0,22000,2018.11.10 更新,['青鸟中山华府\xa0\xa0'],板楼,['北'],大经租房,['1室1厅\xa0\xa0']
10,"['近地铁', '集中供暖']",低楼层(共33层),['242平米\xa0\xa0'],0,29000,2018.11.05 更新,['青鸟中山华府\xa0\xa0'],板楼,['北'],大经租房,['1室1厅\xa0\xa0']

去除无用字符

import re

# Fragments removed from every line of lianjia.csv: the literal "\xa0"
# escapes and the list brackets/quotes left by writing Python lists into
# cells, plus the "平米" unit suffix.  The "', '" separators inside
# multi-value cells are left as they are.
_JUNK_TOKENS = ("\\xa0", "']", "['", "平米")


def _clean_line(line):
    """Return *line* with every junk token removed."""
    for token in _JUNK_TOKENS:
        line = line.replace(token, "")
    return line


filename = 'lianjia.csv'
# Both files are managed by ``with`` so the output is flushed and closed even
# on error; the input is opened first so a missing source file does not leave
# a truncated lian_jia.csv behind.
with open(filename, 'r', encoding='utf-8') as file, \
        open("lian_jia.csv", 'w', encoding='utf-8') as f:
    f.writelines(_clean_line(line) for line in file)

lian_jia.csv

 ,has,louceng,meters,num,price,price_pre,region,type,where,xiaoshou,zone
0,集中供暖,高楼层(共33层),127.86,0,2300,2018.11.08 更新,凤凰城梧桐苑,板楼,南,南焦租房,3室2厅
1,集中供暖,中楼层(共6层),55,0,1200,2018.11.04 更新,华兴小区,板楼,南,世纪公园租房,1室1厅
2,集中供暖,中楼层(共6层),138,0,2400,2018.11.04 更新,河冶小区,板楼,南 北,跃进租房,3室2厅
3,集中供暖,低楼层(共6层),90,1,1500,2018.11.06 更新,瑞国花园,板楼,南,跃进租房,2室2厅
4,集中供暖,低楼层(共14层),180,0,3500,2018.11.13 更新,华脉新村,板楼,南 北,四十中学租房,4室2厅
5,"近地铁', '随时看房', '集中供暖",中楼层(共40层),57,0,3000,2018.11.09 更新,华润大厦,塔楼,西,南长租房,1室1厅
6,"近地铁', '随时看房', '集中供暖",中楼层(共40层),42.56,0,2200,2018.11.09 更新,华润大厦,塔楼,南,南长租房,1室1厅
7,集中供暖,中楼层(共34层),148,0,2500,2018.11.08 更新,北城国际B区,板楼,南 北,沿东租房,3室2厅
8,"近地铁', '随时看房', '集中供暖",中楼层(共40层),40.09,0,2100,2018.11.09 更新,华润大厦,塔楼,南,南长租房,1室1厅
9,"近地铁', '集中供暖",低楼层(共33层),185,0,22000,2018.11.10 更新,青鸟中山华府,板楼,北,大经租房,1室1厅
10,"近地铁', '集中供暖",低楼层(共33层),242,0,29000,2018.11.05 更新,青鸟中山华府,板楼,北,大经租房,1室1厅

可视化户型数量分布

import pandas as pd
import matplotlib.pyplot as plt

# lian_jia.csv already starts with a header row.  header=0 tells pandas to
# replace that row with ``names`` instead of reading it as a data record,
# which would otherwise add a bogus "zone" category to the counts below.
house = pd.read_csv(
    'lian_jia.csv',
    header=0,
    names=['', 'has', 'louceng', 'meters', 'num', 'price', 'price_pre',
           'region', 'type', 'where', 'xiaoshou', 'zone'],
)
# Number of listings per layout, most frequent first.
zone = house['zone'].value_counts()
plt.rcParams['font.sans-serif'] = ['FangSong']  # default font for the Chinese labels
fig, ax = plt.subplots(1, 1, dpi=200)  # set up the canvas
# Plot the ten most frequent layouts.  x/y are DataFrame-only options, so
# they are not passed for this Series plot.
zone.head(10).plot(kind='bar', title='户型数量分布', ax=ax)
plt.legend(['数量'])
plt.show()

柱状图

Feng_MuJin

关注

9
点赞
踩
37

收藏

觉得还不错? 一键收藏
2
评论
爬取链家租房数据，数据处理，进行可视化分析

lianjiaspider.pyimport asyncioimport aiohttpimport pandas as pdfrom lxml import etreeclass LianjiaSpider(object): def __init__(self): self._headers = { "User-Agent":...
复制链接

扫一扫

专栏目录