让数据更有价值,就需要对抓取的信息进行适当的处理,然后展现出来。
0.打开源码,修改源码该位置选择城市:
1.抓取安居客不同区域的房价,然后计算各区域的均价,再通过matplotlib绘图:
2.python源码:
#-*- encoding=UTF-8 -*-
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests
import xlwt
import time
import xlrd
import matplotlib.pyplot as plt
# --- Target selection ------------------------------------------------------
# City abbreviation used as the sub-domain of the listing site.
city = 'xa'
# Name of the worksheet the results are written to / read back from.
sheet_name = '西安'
# Landing page of the selected city.
url_area = "https://{}.fang.anjuke.com/loupan/".format(city)

# --- Text markers searched for in the raw HTML lines -----------------------
area_key = "<a href=https://{}.fang.anjuke.com/loupan/".format(city)
address_key = '<span class="list-map" target="_blank">'
huxing_key = 'huxing'
url_key = 'class="tags-wrap" href="'
panel_key = 'class="tag-panel"'
price_key = 'class="price"'
price_around_key = '"favor-tag around-price"'
tel_key = 'class="tel">'

# --- Result accumulators (parallel lists, one entry per listing) -----------
loupan_area = []     # district
loupan_title = []    # development name
loupan_address = []  # address
loupan_huxing = []   # floor-plan / layout text
loupan_url = []      # detail-page URL
loupan_panel = []    # tag-panel text
loupan_price = []    # price text
loupan_tel = []      # telephone
################################ Scrape Anjuke development listings #################################
# Overview: fetch the city landing page, collect one link per area, then walk
# the numbered pages of every area and append one entry per development to the
# module-level loupan_* lists.
### Collect all areas
anjuke_area = requests.get(url_area).text
anjuke_area=anjuke_area.split("\n")
# Maps the link text (presumably the district name) to the link target with
# "p" appended; the page number is appended to that prefix further below.
area_loupan={}
for ihtml in anjuke_area:
    if area_key in ihtml:
        area_loupan[ihtml.split(">")[1].split("<")[0]]=ihtml.split("a href=")[1].split(">")[0]+"p"
for ikey in area_loupan.keys():
    # Skip entries whose label ends with "线" (presumably subway-line filters
    # rather than districts -- TODO confirm).
    if ikey[-1] == "线":
        continue
    print(ikey)
    inum = 0
    # Number of listing chunks found on each page fetched so far for this area.
    sheet_count = []
    # NOTE(review): the only exit is the count comparison at the bottom of the
    # loop; if every page yields the same number of chunks (including zero)
    # this loop never terminates.
    while 1:
        real_url = area_loupan[ikey] + str(inum + 1) + "w1_/"
        inum=inum+1
        html = urlopen(real_url)
        # NOTE(review): no parser argument is passed to BeautifulSoup here.
        anjuke_html = BeautifulSoup(html.read())
        ### Collect the raw chunks
        # Every top-level node is rendered back to text and split on the
        # opening tag of a listing name.
        title_key_start = "<span class=\"items-name\">"
        loupan_arr = []
        for ihtml in anjuke_html:
            data = str(ihtml).split(title_key_start)
            if len(data) > 1:
                for i in data:
                    loupan_arr.append(i)
        ### Filter the raw chunks
        # Drop the first chunk (text before the first listing name) and cut
        # the last chunk at the stop marker.
        title_key_stop="<!--邮箱订阅-->"
        dest_loupan_arr = []
        for i in range(len(loupan_arr)):
            if i != 0 and i != len(loupan_arr) - 1:
                dest_loupan_arr.append(loupan_arr[i])
            if i == len(loupan_arr) - 1:
                data = str(loupan_arr[i]).split(title_key_stop)
                dest_loupan_arr.append(data[0])
        ### Parse each listing chunk line by line
        for i in dest_loupan_arr:
            price_flag = 0 ## some developments have no price yet
            huxing_flag = 0 ## some developments have no layout information
            tel_flag = 0 ## some developments have no phone number
            data = str(i).split("\n")
            for j in range(len(data)):
                if j == 0: # first line of the chunk starts with the development name
                    loupan_title.append(data[j].split("<")[0])
                    continue
                if address_key in data[j]:
                    loupan_address.append(data[j].split(address_key)[1].split("<")[0])
                    continue
                if huxing_key in data[j]:
                    huxing_flag = 1
                    real_j = j + 1
                    tmp_huxing_str = ""
                    # Consume the following lines up to the closing "</a>".
                    while data[real_j] != "</a>":
                        if "<span>" in data[real_j]:
                            tmp = data[real_j].split("<span>")
                            for it in tmp:
                                if "<" in it:
                                    t = it.split("<")[0]
                                    # The "建筑面积" item is joined with a space
                                    # instead of "/"; other items are "/"-separated.
                                    if "建筑面积" in t:
                                        tmp_huxing_str = tmp_huxing_str.rstrip("/") + " "
                                        tmp_huxing_str = tmp_huxing_str + t
                                    else:
                                        tmp_huxing_str = tmp_huxing_str + t + "/"
                        else:
                            tmp_huxing_str = tmp_huxing_str + data[real_j].strip()
                        real_j = real_j + 1
                    loupan_huxing.append(tmp_huxing_str)
                    continue
                if url_key in data[j]:
                    loupan_url.append(data[j].split(url_key)[1].split("\"")[0])
                    continue
                if panel_key in data[j]:
                    real_j = j + 1
                    tmp_panel_str = ""
                    # Collect the text of every line up to the closing "</div>".
                    while data[real_j] != "</div>":
                        tmp_panel_str = tmp_panel_str + data[real_j].split(">")[1].split("<")[0] + " "
                        real_j = real_j + 1
                    loupan_panel.append(tmp_panel_str.strip())
                    continue
                if price_key in data[j]:
                    price_flag = 1
                    # Strip the tags: keep the text preceding every "<".
                    tmp = data[j].split(">")
                    tmp_price_str = ""
                    for it in tmp:
                        tmp_price_str = tmp_price_str + it.split("<")[0]
                    loupan_price.append(tmp_price_str)
                    continue
                if price_around_key in data[j]:
                    price_flag = 1
                    # For this marker the price text is read from the next line.
                    real_j = j + 1
                    tmp = data[real_j].split(">")
                    tmp_price_str = ""
                    for it in tmp:
                        tmp_price_str = tmp_price_str + it.split("<")[0].strip()
                    loupan_price.append(tmp_price_str)
                    continue
                if tel_key in data[j]:
                    tel_flag = 1
                    loupan_tel.append(data[j].split(tel_key)[1].split("<")[0])
                    continue
            # Placeholders keep price/layout/phone aligned with loupan_title.
            # NOTE(review): address, panel and URL get no placeholder when
            # their marker is missing, so those lists can fall out of step.
            if price_flag == 0:
                loupan_price.append("售价待定")
            if huxing_flag == 0:
                loupan_huxing.append("户型未知")
            if tel_flag == 0:
                loupan_tel.append("号码未知")
            loupan_area.append(ikey)
        # Stop paging once a page yields a different number of chunks than the
        # previous page did.
        if sheet_count != [] and sheet_count[-1] != len(dest_loupan_arr):
            break
        sheet_count.append(len(dest_loupan_arr))
### Build the spreadsheet rows: one header row, then one row per development
### whose price text could be reduced to a number.
# Header columns: name, price, area, layout, address, status, URL, phone.
excel_col = [[u'楼盘', u'价格', u'区域', u'户型', u'地址', u'状态', u'网址', u'电话']]
for icol in range(len(loupan_title)):
    raw_price = loupan_price[icol]
    # Price texts containing "套" are left out (presumably per-unit total
    # prices, which are not comparable to per-area prices -- TODO confirm).
    if "套" in raw_price:
        continue
    # Keep only the ASCII digits of the price text.
    # NOTE(review): all digits are concatenated, so a text holding two
    # numbers would be merged into one value.
    digits = "".join(ch for ch in raw_price if '0' <= ch <= '9')
    if not digits:
        # No number at all (e.g. the "售价待定" placeholder): skip the row.
        continue
    loupan_price[icol] = int(digits)
    # NOTE(review): the lists are combined by position; this relies on every
    # list holding exactly one entry per development.
    excel_col.append([
        loupan_title[icol],
        loupan_price[icol],
        loupan_area[icol],
        loupan_huxing[icol],
        loupan_address[icol],
        loupan_panel[icol],
        loupan_url[icol],
        loupan_tel[icol],
    ])

#### Write the rows to an Excel workbook
app = xlwt.Workbook()  # create the workbook
sheet1 = app.add_sheet(sheet_name, cell_overwrite_ok=True)  # create the sheet
for irow, row_values in enumerate(excel_col):
    for jcol, cell_value in enumerate(row_values):
        sheet1.write(irow, jcol, cell_value)
t = time.strftime('%Y-%m-%d_%H_%M_%S', time.localtime(time.time()))
# xlwt produces the legacy binary .xls format, so the file name must not claim
# to be .xlsx (Excel refuses to open a BIFF file carrying that extension).
t_path = "C:/bz/" + str(city) + t + ".xls"
app.save(t_path)  # save the file
################################ End: scrape Anjuke development listings #################################
################################ Read the Excel data #################################
# Re-open the workbook that was just saved and take the sheet named after
# the city.
workbook = xlrd.open_workbook(t_path)
sheet_data = workbook.sheet_by_name(sheet_name)
cols = sheet_data.col_values(0)   # first column; its length is the row count
title = sheet_data.row_values(0)  # header row

### Column positions, looked up by header text
loupan_index = title.index(u'楼盘')
area_index = title.index(u'区域')
price_index = title.index(u'价格')

# NOTE: area_key is rebound here from the HTML marker string it held earlier
# in the script to the list of areas, in first-seen order.
loupan_key = []       # distinct development names, in first-seen order
area_key = []
lines_data = [title]  # header row followed by the de-duplicated data rows

### De-duplicate rows by development name and accumulate prices per area in
### a single pass.
seen_names = set()    # O(1) duplicate test instead of scanning loupan_key
price_sum = {}
price_count = {}
for irow in range(1, len(cols)):
    rows = sheet_data.row_values(irow)
    name = rows[loupan_index]
    if name in seen_names:
        continue
    seen_names.add(name)
    loupan_key.append(name)
    lines_data.append(rows)

    # An area is registered only when a kept row belongs to it, so every
    # entry of area_key is guaranteed to have an average below.
    area = rows[area_index]
    if area not in price_sum:
        area_key.append(area)
        price_sum[area] = 0
        price_count[area] = 0
    price_sum[area] += int(rows[price_index])
    price_count[area] += 1

### Average price per area, truncated to an integer
area_price = {}
for ikey in area_key:
    area_price[ikey] = int(price_sum[ikey] / price_count[ikey])
################################ End: read the Excel data #################################
################################ Plot #################################
# Select a font able to render the Chinese labels, and keep the minus sign
# displayable with that font.
plt.rcParams['font.sans-serif'] = ['simHei']
plt.rcParams['axes.unicode_minus'] = False

# Bar labels (areas) and bar heights (average prices), in area_key order.
avg_price_x = [name for name in area_key]
avg_price_y = [area_price[name] for name in avg_price_x]

width = 0.5  # the width of the bars
x = range(len(avg_price_x))
# The figure width scales with the number of bars.
fig_width = 10 * (len(avg_price_x) / 10)
fig, ax = plt.subplots(figsize=(fig_width, 6))
rects1 = ax.bar(x, avg_price_y, width, color='yellowgreen')
ax.set_title(sheet_name + '各个区域楼盘均价')
plt.ylabel(u"区域均价(元)")
plt.xticks(x, avg_price_x)

# Write each bar's value centred just above the bar.
for rect in rects1:
    height = rect.get_height()
    label_x = rect.get_x() + rect.get_width() / 2
    label_text = '%d' % int(height)
    ax.text(label_x, height, label_text, ha='center', va='bottom')
plt.show()
################################ End: plot #################################
3.后续会做界面出来,通过界面选择不同的城市,然后点按钮跳出该城市的房价信息。