目录
1、准备工作
网址:某家
IDE:pycharm2019
Python:3.7
代码git:代码地址
2、分析网页
准备工作做好之后(也没啥准备的,哈哈),用谷歌打开网址,按F12
看下网址,好像是根据<li>标签的data-district-spell属性进行拼接的,我们需要获取上海所有地区,然后拼接其各个网址。到了这里感觉是不是少了什么(〃'▽'〃),是不是还少了页数。可能是因为第一页的缘故,所以没显示页数,接下来我们点一下第二页来验证一下
果然,网址中出现了pg2的字样,接下来只要找到房子对应的节点获取其房价就可以了。这个简单,就直接查看就行了
桥豆麻袋,好像有点不对啊,这里显示了100页,却没有那么多啊。没事,这里我们就用标签个数判断,标签个数为0就到结尾了
3、代码实现
效果图:
代码如下:
import requests
from lxml import etree
import time
import matplotlib.pyplot as plt

# Listing pages live under this base URL; a district slug and page number
# are appended to build each crawl target.
base_url = "https://sh.fang.lianjia.com/loupan/"
# Browser-like User-Agent so the site is less likely to reject the request.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36 Edg/81.0.416.64'
}
# Optional HTTPS proxy (currently unused; see the commented-out call below).
ip_dict = {"https": "103.103.3.6:8080"}
# requests.get(base_url,proxies={'https': ip_dict},headers=header)
class AreaInfo:
    """Crawl results for one district: its URL slug plus price aggregates.

    Getters/setters are kept (rather than bare attributes) because the
    rest of the script calls them.
    """

    def __init__(self, name, code, average_price=0.0, sum_price=0.0):
        self.code = code  # district slug used in listing URLs (e.g. "jingan")
        self.name = name  # human-readable district name
        self.average_price = average_price  # mean price over parsed listings
        self.sum_price = sum_price          # total of all parsed prices

    def set_average_price(self, average_price):
        self.average_price = average_price

    def set_sum_price(self, sum_price):
        self.sum_price = sum_price

    def get_code(self):
        return self.code

    def get_name(self):
        return self.name

    def get_average_price(self):
        return self.average_price

    def __repr__(self):
        # Unambiguous form for debugging; __str__ stays the user-facing report line.
        return "AreaInfo(name=%r, code=%r, average_price=%r, sum_price=%r)" % (
            self.name, self.code, self.average_price, self.sum_price)

    def __str__(self):
        return "\n地区: %s " \
               "\t 房价总和: %s " \
               "\t 房价均值:%s" % (self.name, "{:,}".format(self.sum_price), "{:,}".format(self.average_price))
"""
绘制柱状图
"""
def draw(area_infos):
name_list = [];
num_list = [];
for area in area_infos:
name_list.append(area.get_name());
num_list.append(area.get_average_price());
plt.rcParams['font.sans-serif'] = ['KaiTi'] # 指定默认字体
plt.rcParams['axes.unicode_minus'] = False # 解决保存图像是负号'-'显示为方块的问题
plt.title("上海平均房价统计")
plt.barh(range(len(num_list)), num_list, tick_label=name_list)
plt.show()
"""
控制台打印结果
"""
def print_house(area_infos):
for area in area_infos:
print(area);
"""
计算房价结果
"""
def calc_house_price(area_infos):
# https://sh.fang.lianjia.com/loupan/jingan/pg2/#jingan
for area in area_infos:
sum = 0.0;
count = 0;
uncertain = 0;
page_index = 1;
code = area.get_code();
while True:
# 暂停一段时间,比较访问频繁
time.sleep(20);
url = base_url + code + "/pg" + str(page_index) + "/#" + code;
html_str = requests.get(url).text
html = etree.HTML(html_str);
count = int(html.xpath("/html/body/div[3]/div[2]/div/span[2]/text()")[0]);
if count > 0:
house_list = html.cssselect('.main-price > .number');
if len(house_list) != 0:
for house in house_list:
price = 0.0;
try:
price = float(house.text);
except ValueError as e:
# 统计待定区域的房子
uncertain += 1;
sum += price;
else:
break;
# 页数加1
page_index += 1;
else:
count = 0;
break;
area.set_sum_price(round(sum, 2));
# 总数减去未定的个数
count = count - uncertain;
if count > 0:
area.set_average_price(round(sum / count, 2));
if __name__ == '__main__':
    # Fetch the landing page over GET and parse it into an element tree.
    landing = etree.HTML(requests.get(url=base_url, headers=header).text)
    # Each district is an <li> carrying its URL slug in data-district-spell.
    district_nodes = landing.xpath("/html/body/div[2]/div[2]/ul/li")
    area_infos = [
        AreaInfo(node.text, node.attrib.get("data-district-spell"))
        for node in district_nodes
    ]
    calc_house_price(area_infos)
    print_house(area_infos)
    draw(area_infos)
4、总结
由于这个网址有反爬机制,所以太频繁会这样(╥╯^╰╥):
刚开始学Python,写的不好还望见谅(✪ω✪)