数据来自:http://www.zcfcw.cn/
用到的库:
requests
lxml
time
numpy
pyecharts
对邹城2019年在售房价数据可视化
总计爬取了2200条房产售卖信息,筛选得2116条
```text
市中心区 1171
城东区域 475
城北区域 58
城南区域 86
钢山街道 14
唐村镇 26
矿区 22
城西区域 161
北宿镇 12
千泉街道 23
凫山街道 30
中心店镇 11
香城镇 4
张庄镇 3
太平镇 7
其他区域 3
峄山镇 4
看庄镇 1
大束镇 4
```
各区域在售情况
图:
部分区域平均价位:
各区域高低价位图:
其中:
最高:祥生群贤府,桥东苑,义务商贸城,金都花园
最低:东丽花园,南屯矿兑舟园,峄佳社区,石佳花园
各区域价位集中散点图(x轴):
城东:
城西:
城南:
城北:
市中:
取其中较为代表的小区当前价位:
发布时间图:
爬虫:
import requests
from lxml import etree
import time
# Shared crawl state: `list` collects the page URLs, `error` collects the
# markers of pages that failed, `error_down` collects the retry URLs.
# NOTE: `list` shadows the builtin; the name is kept because other code in
# this file refers to it.
list, error, error_down = [], [], []
def find_cont(url):
    """Download one listing page and append its rows to the CSV file.

    Fetches ``url`` and reads the entries found at the even-numbered ``ul``
    positions 2..100 under the element with id ``contectleft``. One CSV row
    per entry is appended to ``F:/python/zc_home/test.csv``. Each row holds
    the link text of ``li`` cells 2, 3, 5, 6, 8, 9, 10 and 11, in that order:
    location, property name, floor, room layout, area, price per square
    metre, total price and (judging by the variable name in the original
    code) the posting time.

    Any failure (network error, HTTP error status, parse or write error) is
    reported by printing ``"error"`` and appending a marker to the
    module-level ``error`` list; nothing is raised to the caller.

    Args:
        url: Address of the listing page to scrape.
    """
    import csv  # local import: csv is not imported at module level

    header = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'}
    cell_indexes = (2, 3, 5, 6, 8, 9, 10, 11)
    try:
        response = requests.get(url, headers=header, timeout=500)
        # Treat HTTP error statuses as failures instead of writing empty rows.
        response.raise_for_status()
        dom = etree.HTML(response.text)
        rows = []
        for i in range(2, 102, 2):
            rows.append([
                ''.join(dom.xpath(
                    '//*[@id="contectleft"]/ul[' + str(i) + ']/li[' + str(cell) + ']/a/text()'))
                for cell in cell_indexes
            ])
            print(i)
        # Write the whole page at once so that a failure part-way through a
        # page does not leave partial rows that a later retry would duplicate.
        with open('F:/python/zc_home/test.csv', 'a+', newline='') as f:
            csv.writer(f).writerows(rows)
    except Exception:
        print("error")
        # The crawl loops publish the current page marker in the global
        # `err`; fall back to the URL when this is called outside them.
        error.append(globals().get('err', url))
# Build the listing-page URLs
def url():
    """Append the URLs of listing pages 1 through 45 to the module-level ``list``."""
    template = 'http://www.zcfcw.cn/sale/search/airallpage_ly%E4%B8%AA%E4%BA%BA_qy_fx_jg_mj_hx_zx_lc_dd_page{}.html'
    list.extend(template.format(page) for page in range(1, 46))
def go():
    """Scrape every URL in the module-level ``list``, pausing between requests.

    While a page is being fetched the global ``err`` holds its 1-based
    position in ``list``; ``find_cont`` appends that value to ``error`` when
    the page fails.
    """
    global err
    # Count from 1 so the progress message and the recorded failure marker
    # match the page number embedded in the URL (url() builds pages 1..45).
    for err, page_url in enumerate(list, start=1):
        print("开始爬取第{}页".format(err))
        print(error)
        find_cont(page_url)
        # Pause between requests.
        time.sleep(1.2)
# NOTE(review): bare string literal with no runtime effect; presumably the
# failure markers printed by an earlier run - confirm.
'''[23, 29, 30, 34, 36, 37, 38]'''
# Build the list of page URLs.
url()
print("网页获取完成")
print("开始获取房产信息")
# Crawl every page in the list.
go()
# Page numbers that go_error() will retry; empty by default.
list_1=[]
# Handle the pages that returned 404
def go_error():
    """Re-scrape the pages whose numbers are listed in the module-level ``list_1``.

    Each retry URL is appended to ``error_down``. While a page is being
    retried the global ``err`` holds its page number, so ``find_cont``
    records a repeated failure in ``error`` under that number.
    """
    global err
    template = 'http://www.zcfcw.cn/sale/search/airallpage_ly%E4%B8%AA%E4%BA%BA_qy_fx_jg_mj_hx_zx_lc_dd_page{}.html'
    for err in list_1:
        retry_url = template.format(err)
        error_down.append(retry_url)
        # Use the URL just built rather than indexing error_down, which may
        # still hold entries from an earlier call.
        print("开始爬取第{}页".format(retry_url))
        find_cont(retry_url)
        print(error)
        # Same pause between requests as go().
        time.sleep(1.2)
# Total number of listings per area
import collections

# Count how often each entry of list_homename occurs.
dic = collections.Counter(list_homename)
list_homename_1 = []        # distinct names, in first-seen order
list_homename_number = []   # matching occurrence counts
for name, count in dic.items():
    print(name, count)
    list_homename_1.append(name)
    list_homename_number.append(count)
# Bar chart
from pyecharts.charts import Bar
from pyecharts import options as opts

# Build the chart step by step: categories, one series, then global options.
bar_hn = Bar()
bar_hn.add_xaxis(list_homename_1)
bar_hn.add_yaxis("各小区在售量", list_homename_number)
bar_hn.set_global_opts(
    # Rotate the x-axis labels by -45 degrees.
    xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-45)),
    title_opts=opts.TitleOpts(title="邹城各区域在售量", subtitle="2019.01-2019.10"),
)
# The options can also be passed as plain dicts, e.g.
# title_opts={"text": "main title", "subtext": "subtitle"}.
bar_hn.render('1.html')
# Collect the unit prices of one area, as input for the area average
import numpy

# NOTE(review): `city` is defined elsewhere; presumably a pandas DataFrame
# with '地区' and '单价' columns - confirm.
g = numpy.array(city.loc[city['地区'] == '市中心区'][['单价']]).tolist()
# Each row of g is a one-element list holding the '单价' cell.
list_3 = [row[0] for row in g]
# Strip the characters '元', '/' and '㎡' from both ends, then convert to int.
list_4 = [int(price.strip('元/㎡')) for price in list_3]
def averagenum(num):
    """Return the arithmetic mean of the numbers in ``num``.

    Args:
        num: A non-empty sequence of numbers.

    Returns:
        The sum of the items divided by their count.

    Raises:
        ZeroDivisionError: If ``num`` is empty.
    """
    return sum(num) / len(num)
# Mean of the unit prices in list_4; the result is neither stored nor printed here.
averagenum(list_4)
# Collect the unit prices of one community, as input for the community average
import numpy

# NOTE(review): `city` is defined elsewhere; presumably a pandas DataFrame
# with '小区名称' and '单价' columns - confirm.
g = numpy.array(city.loc[city['小区名称'] == '泉兴家园'][['单价']]).tolist()
# Each row of g is a one-element list holding the '单价' cell.
list_5 = [row[0] for row in g]
# Strip the characters '元', '/' and '㎡' from both ends, then convert to int.
list_6 = [int(price.strip('元/㎡')) for price in list_5]
print(sorted(list_6))
def averagenum(num):
    """Return the arithmetic mean of the numbers in ``num``.

    Args:
        num: A non-empty sequence of numbers.

    Returns:
        The sum of the items divided by their count.

    Raises:
        ZeroDivisionError: If ``num`` is empty.
    """
    return sum(num) / len(num)
# Mean of the unit prices in list_6; the result is neither stored nor printed here.
averagenum(list_6)
# Scatter plot
import matplotlib.pyplot as plt

# One y value per price, starting at 2000 and growing by 10 per point.
list_0 = [2000 + 10 * k for k in range(len(list_4))]
print(sorted(list_4))
# Prices go on the x axis, the generated values on the y axis.
plt.scatter(list_4, list_0)
plt.show()
# Line chart
import pyecharts.options as opts
from pyecharts.faker import Faker
from pyecharts.charts import Line

# Build the chart step by step: categories, one series, then global options.
c = Line()
c.add_xaxis(['金山新苑','文博苑','昌平花园','燕京花园(西区)','鸿顺御景城','世纪国宏','贵都花园二期','东丽花园','泰和家园','名仕豪庭','欧陆商城','南宫房小区','金山花园','泉兴家园'])
c.add_yaxis("价位", ['7328','9241','7555','6778','8012','8199','6690','3477','7799','9204','2935','4345','7617','8928'])
c.set_global_opts(
    title_opts=opts.TitleOpts(title="部分小区平均价位图"),
    # Rotate the x-axis labels by -45 degrees.
    xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-45)),
)
c.render('2.html')
# Sort "number/number/number" date strings in ascending order
import re

data = []
patt = r'(\d+)/(\d+)/(\d+)'


def date_key(text):
    """Return the three numeric fields at the start of ``text`` as a tuple of ints.

    Raises:
        AttributeError: If ``text`` does not start with digits/digits/digits.
    """
    return tuple(int(part) for part in re.match(patt, text).groups())


# Order the entries in place by their three numeric fields, compared left to
# right as integers (replaces the hand-written exchange sort).
data.sort(key=date_key)
print(data)
其中 xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-45))
将X轴标签旋转45度,避免标签因相互重叠而被省略,从而让X轴显示全部标签
https://pyecharts.org/