# -*- coding: utf-8 -*-
"""
Created on Tue Jun 25 11:07:46 2019
@author: User
"""
import re
import sys
from bs4 import BeautifulSoup #beautifulsoup4库使用时是简写的bs4
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
# Font loaded from the msyh.ttc file under data/ (presumably so CJK text can
# be rendered in matplotlib figures -- confirm against where it is applied).
# A forward slash is used because the previous 'data\msyh.ttc' literal relied
# on the invalid escape sequence "\m" (a SyntaxWarning on recent Python
# versions) and only resolved on Windows; '/' works on every platform and
# matches the other "data/..." paths used in this file.
myfont = FontProperties(fname='data/msyh.ttc')
#import string
# HTTP request headers sent with every requests.get() call in this file;
# the User-Agent imitates a desktop Chrome browser.
header = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/65.0.3325.162 Safari/537.36'
    ),
}

# Counter of listing pages requested so far; incremented by get_page().
i_pg = 0

# Structured dtype describing one listing record: five 50-character text
# fields followed by a 16-bit integer price.
lou_dtype = np.dtype(
    [(field, np.str_, 50)
     for field in ('lp_name', 'area', 'huxing', 'wuye_type', 'sale_state')]
    + [('price', np.int16)]
)

# Accumulator for every scraped listing; get_page() adds rows to it.
fram_all = pd.DataFrame(
    columns=['lp_name', 'area1', 'area2', 'huxing',
             'wuye_type', 'sale_state', 'price'],
)
def _stripped_text(row, **criteria):
    """Return the my_strip()-cleaned text of the first node matching *criteria*, or ''."""
    node = row.find(**criteria)
    return '' if node is None else my_strip(node.text)


def _parse_area(row):
    """Return ``(area1, area2)`` taken from the bracketed part of the location text.

    Both values are '' when the location node or the brackets are missing, and
    ``area2`` is '' when only one token is present.
    """
    node = row.find(class_=re.compile("list-map"))
    if node is None:
        return '', ''
    bracketed = re.search(r"\[(.+?)\]", node.text, re.S)
    if bracketed is None:
        return '', ''
    # str.split() splits on any whitespace (including non-breaking spaces),
    # so this does not depend on which space character the page uses.
    parts = bracketed.group(1).split()
    parts += [''] * (2 - len(parts))
    return parts[0], parts[1]


def _parse_price(row):
    """Return the listing price as an int, or 0 when no usable price is found."""
    span = None
    node = row.find(class_=re.compile("price"))
    if node is not None:
        # Prices quoted per unit / in 10k-yuan are not comparable: record 0.
        if any(unit in node.text for unit in ('万元', '每套', '/套')):
            return 0
        span = node.find('span')
    if span is None:
        # Fall back to the "nearby average price" tag.
        around = row.find(class_=re.compile("favor-tag around-price"))
        span = around.find('span') if around is not None else None
    if span is None:
        return 0
    try:
        return int(my_strip(span.text))
    except ValueError:
        return 0


def _parse_listing(row):
    """Build the fram_all row (as a dict) for one listing node."""
    name_node = row.find(class_=re.compile("items-name"))
    if name_node is not None:
        lp_name = name_node.text.lstrip()
        print('楼盘名称:', lp_name)
    else:
        lp_name = ''
    area1, area2 = _parse_area(row)
    huxing = _stripped_text(row, class_=re.compile("huxing"))
    huxing = huxing.replace("户型:", "").replace("建筑面积:", "")
    return {
        'lp_name': lp_name,
        'area1': area1,
        'area2': area2,
        'huxing': huxing,
        # Property type (villa, residence, ...).
        'wuye_type': _stripped_text(row, class_=re.compile("status-icon wuyetp")),
        # Sale state: text of the first "status-icon" node in the listing.
        'sale_state': _stripped_text(row, attrs={"class": re.compile("status-icon")}),
        'price': _parse_price(row),
    }


def get_page(url, max_pages=2):
    """Scrape one listing page into ``fram_all`` and follow its next-page link.

    Uses the module-level ``header`` for the request, increments the global
    page counter ``i_pg`` and appends one row per listing to the global
    ``fram_all``.

    Args:
        url: URL of the listing page to fetch.
        max_pages: Stop once ``i_pg`` exceeds this value; ``None`` disables
            the cap. The default of 2 keeps the previous hard-coded limit.

    Returns:
        The page's HTML text on success, ``'产生异常!'`` when the request
        fails, or ``''`` when the page cap has been reached.
    """
    global i_pg, fram_all
    print(url)
    i_pg += 1
    print('页:', str(i_pg))
    if max_pages is not None and i_pg > max_pages:
        # Previously sys.exit(); that only "worked" because the caller's bare
        # except swallowed SystemExit. Returning lets the script go on to
        # save the data collected so far.
        return ''
    try:
        response = requests.get(url, timeout=30, headers=header)
        # Raise HTTPError for any non-2xx status.
        response.raise_for_status()
    except requests.RequestException as exc:
        print('产生异常!', exc)
        return '产生异常!'
    # Decode with the encoding detected from the content.
    response.encoding = response.apparent_encoding
    soup = BeautifulSoup(response.text, 'html.parser')
    listings = soup.find_all(class_=re.compile("item-mod"),
                             attrs={"data-soj": re.compile(r"AF_RANK_\d+")})
    rows = []
    for j, row_text in enumerate(listings, start=1):
        print("-----------------------", j)
        try:
            rows.append(_parse_listing(row_text))
        except (AttributeError, IndexError, TypeError, ValueError) as exc:
            # Best effort: one malformed listing must not abort the page.
            print('产生异常!', exc)
    if rows:
        page_frame = pd.DataFrame(rows, columns=list(fram_all.columns))
        # DataFrame.append was removed in pandas 2.0; concat replaces it.
        if fram_all.empty:
            fram_all = page_frame
        else:
            fram_all = pd.concat([fram_all, page_frame], ignore_index=True)
    # Crawl the next page, if any.
    next_link = soup.find('a', {'class': 'next-page next-link'})
    next_url = next_link.attrs.get('href') if next_link is not None else None
    if next_url:
        get_page(next_url, max_pages)
    else:
        print('\n\n没有下一页了。任务完成!!')
    return response.text
# Remove spaces, newlines and tabs anywhere in the string, then trim
# whatever whitespace is left at both ends.
def my_strip(s):
    """Return ``str(s)`` without spaces, newlines or tabs, stripped at both ends."""
    cleaned = str(s)
    for unwanted in (" ", "\n", "\t"):
        cleaned = cleaned.replace(unwanted, "")
    return cleaned.strip()
# BeautifulSoup is used in several places, so the construction is wrapped here.
def my_Beautifulsoup(response):
    """Parse ``str(response)`` with the built-in 'html.parser' and return the soup."""
    markup = str(response)
    return BeautifulSoup(markup, 'html.parser')
# Crawl a detail page.
def get_page_detail(url):
    """Fetch a detail page and store its land-parcel area in the global ``area``.

    The value is the text of the second ``span.layout`` inside the element
    with id ``j_id242``, with the first '平方米' removed and whitespace
    cleaned by my_strip(). It falls back to ``'0'`` when that text is empty
    or the expected elements are missing.

    Args:
        url: URL of the detail page.

    Returns:
        The area string (also written to the global ``area``), or ``None``
        when the response status is not 200, in which case ``area`` is left
        untouched.
    """
    global area
    # Same timeout as get_page() so a stalled server cannot hang the script.
    response = requests.get(url, timeout=30, headers=header)
    if response.status_code != 200:
        return None
    soup = BeautifulSoup(response.text, 'html.parser')
    # Land-parcel area container.
    container = soup.find(attrs={'id': 'j_id242'})
    spans = container.find_all('span', {'class': 'layout'}) if container is not None else []
    if len(spans) > 1:
        area = my_strip(spans[1].text.lstrip().replace('平方米', '', 1))
    else:
        area = ''
    if not area:
        area = '0'
    return area
# =============================================================================
# Script body: crawl the listing pages, save the raw rows, then write three
# pivot-table summaries and show a bar chart of the mean price per district.
print("by 老狼 2019.6.26")
print("QQ: 604631777 微信: cbq8848")
get_page('https://sh.fang.anjuke.com/loupan/all/')  # https://sh.fang.anjuke.com/?from=AF_Home_switchcity

# Renumber the rows and make sure the price column is integer-typed.
fram_all.index = range(len(fram_all))
fram_all['price'] = fram_all['price'].astype('int')  # float32 int
fram_all.to_excel("data/lou_detail.xlsx")
print("原始明细数据已经保存为: data/lou_detail.xlsx")
print("对价格不为0且状态为在售数据进行汇总...")

# Keep only rows with a non-zero price whose sale state is "在售" (on sale).
fr_part = fram_all[~fram_all['price'].isin([0]) & fram_all['sale_state'].isin(['在售'])]

# Summary 1: mean / max / min price per area1. String aggregation names are
# used because passing np.mean / builtin max / min callables is deprecated
# in recent pandas; the resulting column labels are the same.
crosstab = pd.pivot_table(fr_part,
                          index=['area1'],
                          values=['price'],
                          aggfunc=['mean', 'max', 'min'],
                          fill_value=0,
                          margins=False)
crosstab.to_excel("data/lou_hui0.xlsx")
print('分区域1汇总保存为: data/lou_hui0.xlsx')

# ------------------------------
# Bar chart: mean price per area1, highest first.
dict_area_price = dict(fr_part.groupby(['area1'])['price'].mean())
list1 = sorted(dict_area_price.items(), key=lambda x: x[1], reverse=True)
dict_area_price2 = dict(list1)
width = 0.5
idx = range(len(dict_area_price))
# Materialise the dict views so matplotlib receives plain sequences.
xt = list(dict_area_price2.keys())
yt = list(dict_area_price2.values())
plt.bar(idx, yt, width, align='center', color='lightblue')
# myfont is passed explicitly so the Chinese tick and axis labels are drawn
# with the font loaded at the top of the file instead of the default font.
plt.xticks(idx, xt, rotation=40, fontproperties=myfont)
plt.xlabel('区域', fontproperties=myfont)
plt.ylabel('房价平均值', fontproperties=myfont)
fig = plt.gcf()
fig.set_size_inches(15, 6)
plt.show()

# ------------------------
# Summary 2: mean price per area1, split by property type.
crosstab = pd.pivot_table(fr_part,
                          index=['area1'],
                          columns=['wuye_type'],
                          values=['price'],
                          aggfunc=['mean'],
                          fill_value=0,
                          margins=False)
crosstab.to_excel("data/lou_hui1.xlsx")
print('分区域1、物业类型汇总保存为: data/lou_hui1.xlsx')

# Summary 3: mean price per (area1, area2), split by property type.
crosstab = pd.pivot_table(fr_part,
                          index=['area1', 'area2'],
                          columns=['wuye_type'],
                          values=['price'],
                          aggfunc=['mean'],
                          fill_value=0,
                          margins=False)
crosstab.to_excel("data/lou_hui2.xlsx")
print('分区域1、区域2、物业类型汇总保存为: data/lou_hui2.xlsx')