本篇文章旨在练习数据的爬取及可视化。
简介
本项目分两个部分:
1.爬虫:共爬取到链家杭州二手房信息30806条
2.可视化:主要用pyecharts
一、获取数据
- 提取杭州各个行政区二手房信息
- 将得到的数据保存为DataFrame
导入相关的包
#-*-coding:utf-8-*-
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import math
import requests
import lxml
import re
import time
# District display name -> slug inserted into the listing URLs built below.
# Insertion order is the order in which the districts are crawled.
# NOTE(review): 淳安 is administratively a county (淳安县); confirm that the
# key '淳安区' matches the region names expected by the map chart.
area_dic = {
    '西湖区': 'xihu',
    '钱塘新区': 'qiantangxinqu',
    '下城区': 'xiacheng',
    '江干区': 'jianggan',
    '拱墅区': 'gongshu',
    '上城区': 'shangcheng',
    '滨江区': 'binjiang',
    '余杭区': 'yuhang',
    '萧山区': 'xiaoshan',
    '淳安区': 'chunan1',
    '富阳区': 'fuyang',
    '临安区': 'linan',
}
# Browser-like headers sent along with the requests made through `sess`.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36',
    'Referer': 'https://hz.lianjia.com/ershoufang/',
}
# Create one session that is shared by every request in this script.
sess = requests.Session()
# Store the headers on the session itself. Headers given to a single get()
# call are not remembered, so without this every later sess.get(...) would
# be sent with the library's default User-Agent.
sess.headers.update(headers)
sess.get('https://hz.lianjia.com/ershoufang/')
# URL template filled with (district slug, 1-based page number),
# e.g. https://hz.lianjia.com/ershoufang/xihu/pg2/
url = 'https://hz.lianjia.com/ershoufang/{}/pg{}/'
def re_match(re_pattern, string, errif=None):
    """Return the first ``re.findall`` hit of ``re_pattern`` in ``string``.

    The hit is returned with surrounding whitespace stripped. When the
    pattern does not match at all, ``errif`` is returned instead.
    """
    hits = re.findall(re_pattern, string)
    return hits[0].strip() if hits else errif
开始获取房源数据
def _to_float(text):
    """Convert scraped text to float; return NaN when it is missing or malformed."""
    try:
        return float(text)
    except (TypeError, ValueError):
        return math.nan


def _parse_listing(info, area):
    """Build the record dict for one listing element found on a result page.

    ``info`` is one element returned by ``find_all(class_="info clear")`` and
    ``area`` is the district name stored in the ``area`` column. Fields whose
    pattern does not match are stored as None (text) or NaN (numbers).
    """
    info_html = str(info)
    # The houseIcon line holds the tags separated by '|'; the fields read
    # below are layout, size, orientation and decoration, in that order.
    icon_text = re_match('class="houseIcon"></span>(.*?)</div>', info_html)
    icons = [part.strip() for part in icon_text.split('|')] if icon_text else []
    # Pad so a short or missing tag line gives missing values, not IndexError.
    icons += [None] * (4 - len(icons))
    size_text = icons[1].replace('平米', '') if icons[1] is not None else None
    return {
        # District
        'area': area,
        # Listing title
        'title': re_match('target="_blank">(.*?)</a><!--', info_html),
        # Residential community name
        'community': re_match('xiaoqu.*?target="_blank">(.*?)</a>', info_html),
        # Location
        'position': re_match('<a href.*?target="_blank">(.*?)</a>.*?class="address">', info_html),
        # Tax-related tag, e.g. property certificate held for five years
        'tax': re_match('class="taxfree">(.*?)</span>', info_html),
        # Total price
        'total_price': _to_float(re_match('class="totalPrice"><span>(.*?)</span>万', info_html)),
        # Unit price
        'unit_price': _to_float(re_match('data-price="(.*?)"', info_html)),
        'hourseType': icons[0],
        'hourseSize': _to_float(size_text),
        'direction': icons[2],
        'fitment': icons[3],
    }


# Collect one dict per listing and build the DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on each call.
records = []
for key_, value_ in area_dic.items():
    # Read the number of listings reported for this district.
    start_url = 'https://hz.lianjia.com/ershoufang/{}/'.format(value_)
    try:
        html = sess.get(start_url, headers=headers, timeout=30).text
    except requests.RequestException as exc:
        print('获取{}数据失败, 已跳过: {}'.format(key_, exc))
        continue
    house_num = re_match('共找到<span> (.*?) </span>套.*二手房', html)
    if house_num is None or not house_num.isdigit():
        # The count is absent when the page is not a normal result page.
        print('未能获取{}的房源数量, 已跳过'.format(key_))
        continue
    print('正在获取{}数据: 二手房源共计「{}」套'.format(key_, house_num))
    time.sleep(1)
    # The site limits paging, so at most 100 pages (3000 listings, 30 per
    # page) can be fetched per district.
    total_page = int(math.ceil(min(3000, int(house_num)) / 30.0))
    for i in tqdm(range(total_page), desc=key_):
        try:
            html = sess.get(url.format(value_, i + 1), headers=headers, timeout=30).text
        except requests.RequestException as exc:
            # Skip the failed page and keep what has been collected so far.
            print('获取{}第{}页失败, 已跳过: {}'.format(key_, i + 1, exc))
            continue
        soup = BeautifulSoup(html, 'lxml')
        for info in soup.find_all(class_="info clear"):
            records.append(_parse_listing(info, key_))
# One row per listing; an empty DataFrame when nothing was collected.
data = pd.DataFrame(records)
查看下爬取的数据
# Notebook display: preview the first rows of the scraped DataFrame.
data.head()
由于数据是保存在 data里面,所以这里将它保存到csv文件中,方便读取
# Write the scraped table to a CSV file next to the notebook: no index
# column, header row included, encoding name 'utf_8_sig'.
outputfile = './hangzhou_lianjia.csv'
data.to_csv(outputfile, index=False, header=True, encoding='utf_8_sig')
二、数据可视化
导入相关包
from pyecharts.charts import *
from pyecharts.charts import Bar
from pyecharts import options as opts
from pyecharts.commons.utils import JsCode
from jieba import posseg as psg
import collections
import pandas as pd
from tqdm import tqdm
import math
导入数据
# Load the CSV file into `data` and preview the first rows.
inputfile = './hangzhou_lianjia.csv'
data = pd.read_csv(inputfile)
data.head()
查看数据描述性统计
# Notebook display: pandas summary statistics for `data`.
data.describe()
1、房源面积-总价散点图
# Scatter chart of listing size (x axis) against total price (y axis).
scatter = (
    Scatter(init_opts=opts.InitOpts(theme='dark'))
    # Hand pyecharts plain Python lists: it does not convert pandas/numpy
    # containers itself before serialising the chart data.
    .add_xaxis(data['hourseSize'].tolist())
    .add_yaxis("房价", data['total_price'].tolist())
    .set_series_opts(
        label_opts=opts.LabelOpts(is_show=False),
        # Mark the maximum value of the series.
        markpoint_opts=opts.MarkPointOpts(
            data=[opts.MarkPointItem(type_="max", name="最大值")]
        ),
    )
    .set_global_opts(
        legend_opts=opts.LegendOpts(is_show=False),
        title_opts=opts.TitleOpts(title="杭州二手房 总价-面积 散点图"),
        xaxis_opts=opts.AxisOpts(
            name='面积',
            # Treat the axis as a numeric (value) axis.
            type_="value",
            # Hide the split lines.
            splitline_opts=opts.SplitLineOpts(is_show=False),
        ),
        yaxis_opts=opts.AxisOpts(
            name='总价',
            name_location='middle',
            # Treat the axis as a numeric (value) axis.
            type_="value",
            # The default is False, which makes the axis start at 0.
            is_scale=True,
            splitline_opts=opts.SplitLineOpts(is_show=False),
        ),
        visualmap_opts=opts.VisualMapOpts(is_show=True, type_='color', min_=100, max_=1000),
    )
)
scatter.render_notebook()
最贵的一套房源总价是5800万元。
2、各行政区均价
# Mean unit price per district, divided by 10000 and rounded to one decimal,
# as (district, value) pairs for the map series.
temp = data.groupby(['area'])['unit_price'].mean().reset_index()
data_pair = [
    (district, round(mean_price / 10000, 1))
    for district, mean_price in zip(temp['area'].tolist(), temp['unit_price'].tolist())
]
map_ = (
    Map(init_opts=opts.InitOpts(theme='dark'))
    .add("二手房均价", data_pair, '杭州', is_roam=False)
    .set_series_opts(label_opts=opts.LabelOpts(is_show=True))
    .set_global_opts(
        title_opts=opts.TitleOpts(title="杭州各行政区二手房均价"),
        legend_opts=opts.LegendOpts(is_show=False),
        tooltip_opts=opts.TooltipOpts(formatter='{b}:{c}万元'),
        visualmap_opts=opts.VisualMapOpts(min_=3, max_=10),
    )
)
# render() is called with no arguments, so pyecharts picks the output file;
# render_notebook() then shows the chart inline.
map_.render()
map_.render_notebook()
# Notebook display of the values behind the map.
data_pair
- 均价最高的是上城区,整体均价约4.9万元/平米;
- 均价最低的是临安区,均价约1.7万元/平米。
3、均价最贵的10个小区
# Mean unit price and listing count per residential community.
temp = data.groupby(['community'])['unit_price'].agg(['mean', 'count']).reset_index()
# Only communities with at least 3 listings are ranked. They are filtered
# out up front, not replaced by (0, 0) placeholders, so no placeholder bar
# can appear when fewer than 10 communities qualify.
qualified = temp[temp['count'] >= 3]
data_pair = sorted(
    [
        (community, round(mean_price / 10000, 1))
        for community, mean_price in zip(qualified['community'].tolist(), qualified['mean'].tolist())
    ],
    key=lambda x: x[1],
    reverse=True,
)[:10]
bar = (
    Bar(init_opts=opts.InitOpts(theme='dark'))
    .add_xaxis([x[0] for x in data_pair])
    .add_yaxis("", [x[1] for x in data_pair])
    # All global options are set in one call; the x-axis labels are rotated
    # so the community names do not overlap.
    .set_global_opts(
        title_opts=opts.TitleOpts(title="杭州二手房均价TOP 10小区"),
        legend_opts=opts.LegendOpts(is_show=False),
        tooltip_opts=opts.TooltipOpts(formatter='{b}:{c}万元'),
        xaxis_opts=opts.AxisOpts(name_rotate=60, axislabel_opts={"rotate": 20}),
    )
)
bar.render_notebook()
# Notebook display of the ranked (community, value) pairs.
data_pair
可以看到均价最高的小区是融创大家候潮府,均价为10.9万元/平米。
4、均价最贵的10个地段
# Mean unit price per location, divided by 10000 and rounded to one decimal;
# keep the 10 highest values.
temp = data.groupby(['position'])['unit_price'].mean().reset_index()
data_pair = sorted(
    (
        (position, round(mean_price / 10000, 1))
        for position, mean_price in zip(temp['position'].tolist(), temp['unit_price'].tolist())
    ),
    key=lambda x: x[1],
    reverse=True,
)[:10]
# JavaScript snippet passed through to the chart as the bar colour.
gradient = JsCode("""new echarts.graphic.LinearGradient(0, 1, 0, 0,
[{
offset: 0,
color: 'rgb(0,206,209)'
}, {
offset: 1,
color: 'rgb(218,165,32)'
}])""")
bar = (
    Bar(init_opts=opts.InitOpts(theme='dark'))
    .add_xaxis([x[0] for x in data_pair])
    .add_yaxis('二手房均价', [x[1] for x in data_pair])
    .set_series_opts(
        label_opts=opts.LabelOpts(is_show=True, font_style='italic'),
        itemstyle_opts=opts.ItemStyleOpts(color=gradient),
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title="杭州二手房均价TOP 10地段"),
        legend_opts=opts.LegendOpts(is_show=False),
        tooltip_opts=opts.TooltipOpts(formatter='{b}:{c}万元'),
    )
)
bar.render_notebook()
5、户型分布
# Number of listings per layout type; keep the 10 most frequent.
temp = data.groupby(['hourseType'])['area'].count().reset_index()
data_pair = sorted(
    zip(temp['hourseType'].tolist(), temp['area'].tolist()),
    key=lambda x: x[1],
    reverse=True,
)[:10]
pie = (
    Pie(init_opts=opts.InitOpts(theme='dark'))
    .add(
        '',
        data_pair,
        radius=["30%", "75%"],
        rosetype="radius",
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title="杭州二手房 户型分布"),
        legend_opts=opts.LegendOpts(is_show=False),
    )
    # Label each slice with its name and percentage.
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {d}%"))
)
pie.render_notebook()
三室两厅成为主力军;但小户型的占比也很多。
6、标题文本词云图
来看看房源标题中出现最多的词语
# Words dropped from the cloud regardless of their frequency.
stop_words = ['花园', '业主', '出售']
# Concatenate every title that is a string (non-string cells are skipped).
string = ''.join(title for title in data['title'] if isinstance(title, str))
words = psg.cut(string)
# Keep a word only when it is longer than one character, its tag is neither
# 'm' nor 'x', and it is not a stop word.
word_list = [
    x.word
    for x in words
    if len(x.word) != 1 and x.flag not in ('m', 'x') and x.word not in stop_words
]
# The 100 most frequent remaining words with their counts.
data_pair = collections.Counter(word_list).most_common(100)
wc = (
    WordCloud()
    .add("", data_pair, word_size_range=[20, 100], shape='triangle')
    .set_global_opts(title_opts=opts.TitleOpts(title="房源描述词云图"))
)
wc.render_notebook()