python_安居客区域房源均价工具(matplotlib)

让数据更有价值,就需要对抓取的信息进行适当的处理,然后展现出来。

0.打开源码,修改源码该位置选择城市:

1.抓取安居客不同区域的房价,计算各区域的均价,然后通过matplotlib绘图:

2.python源码:

#-*- encoding=UTF-8 -*-
import os
import time
from urllib.request import urlopen

import matplotlib.pyplot as plt
import requests
import xlrd
import xlwt
from bs4 import BeautifulSoup

# City to scrape: the Anjuke sub-domain abbreviation, plus the label used for
# the Excel sheet name and the chart title.
city = "xa"
sheet_name = "西安"
url_area = "https://{}.fang.anjuke.com/loupan/".format(city)

### Marker substrings searched for in the downloaded HTML
area_key = "<a href=" + url_area
address_key = '<span class="list-map" target="_blank">'
huxing_key = "huxing"
url_key = 'class="tags-wrap" href="'
panel_key = 'class="tag-panel"'
price_key = 'class="price"'
price_around_key = '"favor-tag around-price"'
tel_key = 'class="tel">'

### Result lists, filled while the listing pages are parsed
loupan_area = []      # district
loupan_title = []     # listing name
loupan_address = []   # listing address
loupan_huxing = []    # floor-plan summary
loupan_url = []       # listing URL
loupan_panel = []     # tag panel text
loupan_price = []     # price text
loupan_tel = []       # phone number

################################抓取安居客楼盘价格#################################
### Collect every district and the URL prefix of its listing pages
# A timeout is passed so the script cannot hang forever when the site does not
# answer; without one, requests waits indefinitely.
anjuke_area = requests.get(url_area, timeout=30).text
anjuke_area = anjuke_area.split("\n")
# Maps district name -> district URL with "p" appended.
area_loupan = {}
for ihtml in anjuke_area:
    if area_key in ihtml:
        # Name: the text between the first ">" and the next "<" on the line.
        area_name = ihtml.split(">")[1].split("<")[0]
        # URL: everything after "a href=" up to the closing ">" of the tag.
        area_href = ihtml.split("a href=")[1].split(">")[0]
        area_loupan[area_name] = area_href + "p"

# Walk every district, page through its listing pages and append one entry per
# listing to the module-level loupan_* lists.
for ikey in area_loupan.keys():
    # Entries whose name ends with "线" are skipped
    # (presumably metro-line filters rather than districts -- TODO confirm).
    if ikey[-1] == "线":
        continue
    print(ikey)
    inum = 0
    # Number of listings parsed from each page fetched so far for this district.
    sheet_count = []
    while 1:
        # Page URL = district prefix + 1-based page number + "w1_/".
        real_url = area_loupan[ikey] + str(inum + 1) + "w1_/"
        inum=inum+1
        html = urlopen(real_url)
        # NOTE(review): no parser is named here, so bs4 chooses one itself; the
        # line-based parsing below may depend on which parser is installed.
        anjuke_html = BeautifulSoup(html.read())
        ### Collect the raw chunks
        # Each top-level node is rendered to text and cut at the listing-name
        # marker; nodes that do not contain the marker are ignored.
        title_key_start = "<span class=\"items-name\">"
        loupan_arr = []
        for ihtml in anjuke_html:
            data = str(ihtml).split(title_key_start)
            if len(data) > 1:
                for i in data:
                    loupan_arr.append(i)
        ### Filter the raw chunks
        # The chunk at index 0 is dropped, the middle chunks are kept as they
        # are, and the final chunk is truncated at the stop marker.
        title_key_stop="<!--邮箱订阅-->"
        dest_loupan_arr = []
        for i in range(len(loupan_arr)):
            if i != 0 and i != len(loupan_arr) - 1:
                dest_loupan_arr.append(loupan_arr[i])
            if i == len(loupan_arr) - 1:
                data = str(loupan_arr[i]).split(title_key_stop)
                dest_loupan_arr.append(data[0])
        ### Parse each listing chunk line by line
        for i in dest_loupan_arr:
            price_flag = 0  ## some listings have no price
            huxing_flag = 0  ## some listings have no floor-plan info
            tel_flag = 0  ## some listings have no phone number
            data = str(i).split("\n")
            for j in range(len(data)):
                if j == 0:  # the first line of a chunk starts with the listing name
                    loupan_title.append(data[j].split("<")[0])
                    continue
                if address_key in data[j]:
                    loupan_address.append(data[j].split(address_key)[1].split("<")[0])
                    continue
                if huxing_key in data[j]:
                    huxing_flag = 1
                    # Consume the following lines up to a line that is exactly "</a>".
                    # NOTE(review): there is no bounds check; if that line never
                    # appears, data[real_j] raises IndexError.
                    real_j = j + 1
                    tmp_huxing_str = ""
                    while data[real_j] != "</a>":
                        if "<span>" in data[real_j]:
                            tmp = data[real_j].split("<span>")
                            for it in tmp:
                                if "<" in it:
                                    t = it.split("<")[0]
                                    if "建筑面积" in t:
                                        # The floor-area text replaces the trailing "/" with a space.
                                        tmp_huxing_str = tmp_huxing_str.rstrip("/") + " "
                                        tmp_huxing_str = tmp_huxing_str + t
                                    else:
                                        # Other <span> texts are joined with "/".
                                        tmp_huxing_str = tmp_huxing_str + t + "/"
                        else:
                            tmp_huxing_str = tmp_huxing_str + data[real_j].strip()
                        real_j = real_j + 1
                    loupan_huxing.append(tmp_huxing_str)
                    continue
                if url_key in data[j]:
                    loupan_url.append(data[j].split(url_key)[1].split("\"")[0])
                    continue
                if panel_key in data[j]:
                    # Join the tag texts on the following lines with spaces, up to
                    # a line that is exactly "</div>" (again without a bounds check).
                    real_j = j + 1
                    tmp_panel_str = ""
                    while data[real_j] != "</div>":
                        tmp_panel_str = tmp_panel_str + data[real_j].split(">")[1].split("<")[0] + " "
                        real_j = real_j + 1
                    loupan_panel.append(tmp_panel_str.strip())
                    continue
                if price_key in data[j]:
                    price_flag = 1
                    # Strip the tags: keep the text before each "<" of every ">"-separated piece.
                    tmp = data[j].split(">")
                    tmp_price_str = ""
                    for it in tmp:
                        tmp_price_str = tmp_price_str + it.split("<")[0]
                    loupan_price.append(tmp_price_str)
                    continue
                if price_around_key in data[j]:
                    price_flag = 1
                    # For this marker the price text is read from the next line.
                    real_j = j + 1
                    tmp = data[real_j].split(">")
                    tmp_price_str = ""
                    for it in tmp:
                        tmp_price_str = tmp_price_str + it.split("<")[0].strip()
                    loupan_price.append(tmp_price_str)
                    continue
                if tel_key in data[j]:
                    tel_flag = 1
                    loupan_tel.append(data[j].split(tel_key)[1].split("<")[0])
                    continue
            # Placeholders are appended when price, floor plan or phone was not found.
            # NOTE(review): address, URL and panel get no placeholder, so those lists
            # can end up shorter than loupan_title when a listing lacks them.
            if price_flag == 0:
                loupan_price.append("售价待定")
            if huxing_flag == 0:
                loupan_huxing.append("户型未知")
            if tel_flag == 0:
                loupan_tel.append("号码未知")
            loupan_area.append(ikey)
        # Stop paging once this page's listing count differs from the previous
        # page's; the first page never breaks because sheet_count is still empty.
        if sheet_count != [] and sheet_count[-1] != len(dest_loupan_arr):
            break
        sheet_count.append(len(dest_loupan_arr))

# Rows destined for the spreadsheet; the first row is the header.
excel_col = [[u'楼盘', u'价格', u'区域', u'户型', u'地址', u'状态', u'网址', u'电话']]
for icol, listing_name in enumerate(loupan_title):
    raw_price = loupan_price[icol]
    # Prices whose text contains "套" are left out.
    if "套" in raw_price:
        continue
    # Keep only the ASCII digits of the price text.
    digits = "".join(ch for ch in raw_price if '0' <= ch <= '9')
    if digits == "":
        # No digits at all: there is no usable price, so the listing is skipped.
        continue
    # The numeric price overwrites the text entry in place.
    loupan_price[icol] = int(digits)
    excel_col.append([
        listing_name,
        loupan_price[icol],
        loupan_area[icol],
        loupan_huxing[icol],
        loupan_address[icol],
        loupan_panel[icol],
        loupan_url[icol],
        loupan_tel[icol],
    ])

#### Write the collected rows to an Excel workbook
app = xlwt.Workbook()  # create the workbook
sheet1 = app.add_sheet(sheet_name, cell_overwrite_ok=True)  # create the sheet

for row_idx, row_values in enumerate(excel_col):
    for col_idx, cell_value in enumerate(row_values):
        sheet1.write(row_idx, col_idx, cell_value)

t = time.strftime('%Y-%m-%d_%H_%M_%S', time.localtime(time.time()))
out_dir = "C:/bz/"
# Create the output directory up front so that saving cannot fail on a
# missing folder after all the data has been gathered.
os.makedirs(out_dir, exist_ok=True)
# xlwt produces the legacy binary .xls format, so the file is named .xls
# rather than .xlsx to match its actual content.
t_path = out_dir + str(city) + t + ".xls"
app.save(t_path)  # save the file
################################抓取安居客楼盘价格#################################

################################读取excel数据#################################
# Open the workbook at t_path.
workbook = xlrd.open_workbook(t_path)
# List of the sheet objects in the workbook.
sheets = workbook.sheets()

sheet_data = workbook.sheet_by_name(sheet_name)
cols = sheet_data.col_values(0)
title = sheet_data.row_values(0)

### Column positions looked up from the header row
loupan_index = title.index(u'楼盘')
area_index = title.index(u'区域')
price_index = title.index(u'价格')
### Distinct listing names and distinct districts, in order of first appearance
loupan_key = []
area_key = []
### Retained rows: the header first, then one row per distinct listing name
lines_data = [title]
for row_no in range(1, len(cols)):
    rows = sheet_data.row_values(row_no)
    listing_name = rows[loupan_index]
    district = rows[area_index]
    if listing_name not in loupan_key:
        loupan_key.append(listing_name)
        lines_data.append(rows)
    # The district is recorded for every row, including rows dropped above.
    if district not in area_key:
        area_key.append(district)

### Average price per district
# Maps district -> average of the retained rows' prices, truncated to an int.
area_price = {}
for ikey in area_key:
    district_prices = [
        int(row[price_index]) for row in lines_data if row[area_index] == ikey
    ]
    # A district with no matching row gets no entry at all.
    if len(district_prices) != 0:
        area_price[ikey] = int(sum(district_prices) / len(district_prices) * 1.0)
################################读取excel数据#################################

################################绘图#################################
# Font settings so that Chinese labels and the minus sign are not garbled.
plt.rcParams['font.sans-serif'] = ['simHei']
plt.rcParams['axes.unicode_minus'] = False

# area_key records the district of every spreadsheet row, while averages are
# computed from de-duplicated rows only, so a district can be missing from
# area_price. Such districts are skipped instead of raising KeyError.
avg_price_x = [ikey for ikey in area_key if ikey in area_price]
avg_price_y = [area_price[ikey] for ikey in avg_price_x]

width = 0.5  # the width of the bars
x = range(len(avg_price_x))
# The figure width grows with the number of districts.
fig, ax = plt.subplots(figsize=(10*(len(avg_price_x)/10), 6))
rects1 = ax.bar(x, avg_price_y, width, color='yellowgreen')
ax.set_title(sheet_name+'各个区域楼盘均价')
plt.ylabel(u"区域均价(元)")
plt.xticks(x, avg_price_x)

# Print each bar's value just above its top, centred horizontally.
for rect in rects1:
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2, height, '%d' % int(height), ha='center', va='bottom')

plt.show()
################################绘图#################################
3.后续会做界面出来,通过界面选择不同的城市,然后点按钮跳出该城市的房价信息。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值