爬取链家租房数据，对数据进行清洗处理，并做可视化分析

 lianjiaspider.py

import asyncio
import aiohttp
import pandas as pd
from lxml import etree


class LianjiaSpider(object):
    """Crawl rental listings from sjz.lianjia.com and dump them to ``lianjia.csv``.

    Listing pages are fetched one after another; every parsed listing is
    appended to ``self._data`` as a dict, and the accumulated rows are written
    to ``lianjia.csv`` after each successfully parsed page.
    """

    def __init__(self):
        # Browser-like User-Agent sent with every request.
        self._headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36"}
        # One dict per listing, in crawl order.
        self._data = list()

    @staticmethod
    def _first(nodes, index=0, default=""):
        """Return ``nodes[index]``, or ``default`` when the XPath result is too short."""
        return nodes[index] if len(nodes) > index else default

    async def get(self, url):
        """Fetch ``url`` and return the response body as text.

        Returns ``None`` when the request fails, times out (3 seconds total),
        cannot be decoded, or the server answers with a non-200 status.
        """
        # A bare int timeout is deprecated in aiohttp; ClientTimeout is the
        # supported way to express the same 3-second limit.
        timeout = aiohttp.ClientTimeout(total=3)
        async with aiohttp.ClientSession() as session:
            try:
                async with session.get(url, headers=self._headers, timeout=timeout) as resp:
                    if resp.status == 200:
                        return await resp.text()
            except (aiohttp.ClientError, asyncio.TimeoutError, UnicodeError) as e:
                # Best effort: report the failure and let the caller skip the page.
                print(e.args)
        return None

    async def parse_html(self):
        """Crawl pages 1-76 of the listing index, parse each one and save the results."""
        for page in range(1, 77):
            url = "https://sjz.lianjia.com/zufang/pg{}/".format(page)
            print("正在爬取{}".format(url))
            text = await self.get(url)  # page body, or None on failure
            if not text:
                # etree.HTML(None) raises, which would abort the whole crawl
                # because of a single failed page.
                print("页面获取失败,已跳过{}".format(url))
                continue
            tree = etree.HTML(text)
            if tree is None:
                # Nothing parseable in the response body.
                continue
            self.parse_page(tree)
            print("正在存储数据....")
            # The full accumulated data set is rewritten after every page, so
            # an interrupted crawl still leaves a usable file behind.
            data = pd.DataFrame(self._data)
            data.to_csv("lianjia.csv", encoding='utf_8_sig')  # utf_8_sig writes a BOM

    def parse_page(self, html):
        """Extract every listing found in ``html`` and append it to ``self._data``.

        ``html`` must offer an ``xpath()`` method (e.g. an lxml element).
        ``region``, ``zone``, ``meters``, ``where`` and ``has`` are stored as
        the raw XPath result lists; the remaining fields are single strings,
        or ``""`` when the page does not contain the expected node.
        """
        first = self._first
        for info in html.xpath("//div[@class='info-panel']"):
            region = info.xpath(".//span[@class='region']/text()")
            zone = info.xpath(".//span[@class='zone']/span/text()")
            meters = info.xpath(".//span[@class='meters']/text()")
            where = info.xpath(".//div[@class='where']/span[4]/text()")

            con = info.xpath(".//div[@class='con']/text()")
            floor = first(con, 0)  # floor description
            house_type = first(con, 1)  # building style

            agent = first(info.xpath(".//div[@class='con']/a/text()"))

            has = info.xpath(".//div[@class='left agency']//text()")

            price = first(info.xpath(".//div[@class='price']/span/text()"))
            price_pre = first(info.xpath(".//div[@class='price-pre']/text()"))
            look_num = first(info.xpath(".//div[@class='square']//span[@class='num']/text()"))

            self._data.append({
                "region": region,
                "zone": zone,
                "meters": meters,
                "where": where,
                "louceng": floor,
                "type": house_type,
                "xiaoshou": agent,
                "has": has,
                "price": price,
                "price_pre": price_pre,
                "num": look_num,
            })

    def run(self):
        """Run the whole crawl to completion on a fresh event loop."""
        # asyncio.get_event_loop() outside a running loop is deprecated;
        # asyncio.run creates, runs and closes the loop for us.
        asyncio.run(self.parse_html())


if __name__ == '__main__':
    # Script entry point: build the spider and start the crawl.
    spider = LianjiaSpider()
    spider.run()

lianjia.csv

  ,has,louceng,meters,num,price,price_pre,region,type,where,xiaoshou,zone
0,['集中供暖'],高楼层(共33层),['127.86平米\xa0\xa0'],0,2300,2018.11.08 更新,['凤凰城梧桐苑\xa0\xa0'],板楼,['南'],南焦租房,['3室2厅\xa0\xa0']
1,['集中供暖'],中楼层(共6层),['55平米\xa0\xa0'],0,1200,2018.11.04 更新,['华兴小区\xa0\xa0'],板楼,['南'],世纪公园租房,['1室1厅\xa0\xa0']
2,['集中供暖'],中楼层(共6层),['138平米\xa0\xa0'],0,2400,2018.11.04 更新,['河冶小区\xa0\xa0'],板楼,['南 北'],跃进租房,['3室2厅\xa0\xa0']
3,['集中供暖'],低楼层(共6层),['90平米\xa0\xa0'],1,1500,2018.11.06 更新,['瑞国花园\xa0\xa0'],板楼,['南'],跃进租房,['2室2厅\xa0\xa0']
4,['集中供暖'],低楼层(共14层),['180平米\xa0\xa0'],0,3500,2018.11.13 更新,['华脉新村\xa0\xa0'],板楼,['南 北'],四十中学租房,['4室2厅\xa0\xa0']
5,"['近地铁', '随时看房', '集中供暖']",中楼层(共40层),['57平米\xa0\xa0'],0,3000,2018.11.09 更新,['华润大厦\xa0\xa0'],塔楼,['西'],南长租房,['1室1厅\xa0\xa0']
6,"['近地铁', '随时看房', '集中供暖']",中楼层(共40层),['42.56平米\xa0\xa0'],0,2200,2018.11.09 更新,['华润大厦\xa0\xa0'],塔楼,['南'],南长租房,['1室1厅\xa0\xa0']
7,['集中供暖'],中楼层(共34层),['148平米\xa0\xa0'],0,2500,2018.11.08 更新,['北城国际B区\xa0\xa0'],板楼,['南 北'],沿东租房,['3室2厅\xa0\xa0']
8,"['近地铁', '随时看房', '集中供暖']",中楼层(共40层),['40.09平米\xa0\xa0'],0,2100,2018.11.09 更新,['华润大厦\xa0\xa0'],塔楼,['南'],南长租房,['1室1厅\xa0\xa0']
9,"['近地铁', '集中供暖']",低楼层(共33层),['185平米\xa0\xa0'],0,22000,2018.11.10 更新,['青鸟中山华府\xa0\xa0'],板楼,['北'],大经租房,['1室1厅\xa0\xa0']
10,"['近地铁', '集中供暖']",低楼层(共33层),['242平米\xa0\xa0'],0,29000,2018.11.05 更新,['青鸟中山华府\xa0\xa0'],板楼,['北'],大经租房,['1室1厅\xa0\xa0']

去除无用字符(清除 lianjia.csv 中的 \xa0、方括号、引号以及"平米"单位,结果写入 lian_jia.csv)

import re

# Clean the raw crawl output: copy lianjia.csv to lian_jia.csv line by line,
# stripping the residue left by writing Python lists into CSV cells.
filename = 'lianjia.csv'

# Compiled once, outside the loop. Removing the literal text "xa0" from an
# escaped "\xa0\xa0" pair leaves two backslashes, which are removed below.
pattern = re.compile(r'xa0')

# Fragments removed from every line, in this order: the leftover double
# backslash, the list closers/openers, and the square-metre unit.
# NOTE: multi-element lists keep their inner "', '" separators, e.g.
# "近地铁', '随时看房', '集中供暖".
junk_fragments = ("\\\\", "']", "['", "平米")

# Both files are managed by ``with`` so the output is flushed and closed even
# if a line fails to process (the output handle was previously never closed).
with open(filename, 'r', encoding='utf-8') as file, \
        open("lian_jia.csv", 'w', encoding='utf-8') as f:
    for i in file:  # stream line by line instead of readlines()
        d = pattern.sub('', i)
        for fragment in junk_fragments:
            d = d.replace(fragment, "")
        f.write(d)

lian_jia.csv

 ,has,louceng,meters,num,price,price_pre,region,type,where,xiaoshou,zone
0,集中供暖,高楼层(共33层),127.86,0,2300,2018.11.08 更新,凤凰城梧桐苑,板楼,南,南焦租房,3室2厅
1,集中供暖,中楼层(共6层),55,0,1200,2018.11.04 更新,华兴小区,板楼,南,世纪公园租房,1室1厅
2,集中供暖,中楼层(共6层),138,0,2400,2018.11.04 更新,河冶小区,板楼,南 北,跃进租房,3室2厅
3,集中供暖,低楼层(共6层),90,1,1500,2018.11.06 更新,瑞国花园,板楼,南,跃进租房,2室2厅
4,集中供暖,低楼层(共14层),180,0,3500,2018.11.13 更新,华脉新村,板楼,南 北,四十中学租房,4室2厅
5,"近地铁', '随时看房', '集中供暖",中楼层(共40层),57,0,3000,2018.11.09 更新,华润大厦,塔楼,西,南长租房,1室1厅
6,"近地铁', '随时看房', '集中供暖",中楼层(共40层),42.56,0,2200,2018.11.09 更新,华润大厦,塔楼,南,南长租房,1室1厅
7,集中供暖,中楼层(共34层),148,0,2500,2018.11.08 更新,北城国际B区,板楼,南 北,沿东租房,3室2厅
8,"近地铁', '随时看房', '集中供暖",中楼层(共40层),40.09,0,2100,2018.11.09 更新,华润大厦,塔楼,南,南长租房,1室1厅
9,"近地铁', '集中供暖",低楼层(共33层),185,0,22000,2018.11.10 更新,青鸟中山华府,板楼,北,大经租房,1室1厅
10,"近地铁', '集中供暖",低楼层(共33层),242,0,29000,2018.11.05 更新,青鸟中山华府,板楼,北,大经租房,1室1厅

可视化户型数量分布

import pandas as pd
import matplotlib.pyplot as plt

# Plot the ten most common house layouts ("zone" column) as a bar chart.

# Column names in file order; the first, unnamed column is the row index that
# pandas wrote when the crawl results were saved.
columns = ['', 'has', 'louceng', 'meters', 'num', 'price', 'price_pre',
           'region', 'type', 'where', 'xiaoshou', 'zone']

# header=0 tells pandas that the first line of the file is a header to be
# replaced by ``names``. Without it the header line is read as a data row and
# the literal string "zone" gets counted as a house layout.
house = pd.read_csv('lian_jia.csv', header=0, names=columns)
zone = house['zone'].value_counts()  # layouts, most frequent first

plt.rcParams['font.sans-serif'] = ['FangSong']  # font able to render the Chinese labels
fig, ax = plt.subplots(1, 1, dpi=200)  # single high-resolution canvas

# Series.plot takes its x/y from the index and values, so no x=/y= is passed.
zone.head(10).plot(kind='bar', title='户型数量分布', ax=ax)  # top 10 layouts
plt.legend(['数量'])
plt.show()

柱状图

 

  • 8
    点赞
  • 34
    收藏
    觉得还不错? 一键收藏
  • 2
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值