Python notes: scraping detailed new-home listing data for Shanghai, building cross-tabulated summaries, and saving everything to Excel

 

# -*- coding: utf-8 -*-
"""
Created on Tue Jun 25 11:07:46 2019

@author: User
"""

import re
import sys
from bs4 import BeautifulSoup       # the beautifulsoup4 package is imported under the short name bs4
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
myfont = FontProperties(fname='data/msyh.ttc')   # Chinese-capable font for the chart labels (forward slash avoids escape issues)
#import string


header={  
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'  
}

i_pg = 0   # page counter; get_page() stops after 2 list pages to keep test runs short

# structured dtype describing one listing record (defined here but not used further below)
lou_dtype = np.dtype([('lp_name', np.str_, 50),
                      ('area', np.str_, 50),
                      ('huxing', np.str_, 50),
                      ('wuye_type', np.str_, 50),
                      ('sale_state', np.str_, 50),
                      ('price', np.int16)
                      ])

fram_all = pd.DataFrame(columns=['lp_name',
                                 'area1',
                                 'area2',
                                 'huxing',
                                 'wuye_type',
                                 'sale_state',
                                 'price'])
 
def get_page(url):
    print(url)
    
    global i_pg
    i_pg += 1
    print('Page:', str(i_pg))
    
    if i_pg > 2:
        return ''       # stop after the first 2 list pages to keep test runs short
        
    try:
        response = requests.get(url, timeout = 30, headers=header)
        
        # raise an HTTPError if the status code is not 200
        response.raise_for_status()
        # set the correct encoding
        response.encoding = response.apparent_encoding

        soup = BeautifulSoup(response.text, 'html.parser')
        result_li = soup.find_all(class_=re.compile("item-mod"),
                                  attrs={"data-soj": re.compile("AF_RANK_\d+")})   

        
        j = 0
        # process each listing entry on the current page
        for row_text in result_li:

            # j counts the listings on the current page (handy for progress output while debugging)
            j = j + 1
            
            print("-----------------------",j)
#            print(row_text)

            # development name
            lp_name_str = row_text.find(class_=re.compile("items-name"))
            if lp_name_str:
                lp_name = lp_name_str.text.lstrip()
                print('Development name:', lp_name)
            else:
                lp_name = ''
                
            # location (district and sub-area)
            area_str = row_text.find(class_=re.compile("list-map"))
            if area_str:
                area_str = my_strip(area_str.text)
                area_g = re.match(r"\[(.+)\]", area_str)
                area = area_g.group(0).replace("[ ", "").replace(" ]", "").replace(' ', ' ')
                area1 = area.split()[0]
                area2 = area.split()[1]
#                print('Location:', area1, '-', area2)
            else:
                area = ''
                area1 = ''   # default both parts so a missing location block cannot raise NameError
                area2 = ''
                
            
            # unit layout / built-up area
            huxing_str = row_text.find(class_=re.compile("huxing"))
            if huxing_str:
                huxing = my_strip(huxing_str.text)
                huxing = huxing.replace("户型:", "").replace("建筑面积:", "")
#                print('Layout:', huxing)
            else:
                huxing = ''
                
            # property type (villa, apartment, ...)
            wuye_type_str = row_text.find(class_=re.compile("status-icon wuyetp"))
            if wuye_type_str:
                wuye_type = my_strip(wuye_type_str.text)
#                print('Property type:', wuye_type)
            else:
                wuye_type = ''
                 
                
            # sale status
            sale_state_str = row_text.find(attrs={"class": re.compile("status-icon")})
            if sale_state_str:
                sale_state = my_strip(sale_state_str.text)
#                print('Sale status:', sale_state)
            else:
                sale_state = ''
                
                
            # average price
            price_str = row_text.find(class_=re.compile("price"))
            if price_str:
                ch_str2 = ['万元','每套','/套']   # wording used when only a total / per-unit price is shown
                price_is0 = False
                for s in ch_str2:
                    if s in price_str.text: # no per-square-metre price available, so record 0
                        price = 0
                        price_is0 = True
#                        print('Average price:', price)

#                        price_str = ''  # if one of the keywords is present, skip the lookups below

                if not price_is0:
                    price = price_str.find('span')
                    if price:
                        price = my_strip(price.text)
#                        print('Average price:', price)
                    else:
                        price_str = row_text.find(class_=re.compile("favor-tag around-price"))
                        if price_str:
                            price = price_str.find('span')
                            if price:
                                price = my_strip(price.text)
#                                print('Nearby average price:', price)
                            else:
                                price = '0'
#                                print('No average price:', price)
                        else:
                            price = 0
#                            print('No nearby average price:', price)
            else:
                price = 0   # no price element at all; avoid carrying over the previous listing's value
                            
            frame_new = pd.DataFrame({'lp_name':lp_name,
                              'area1':area1,
                              'area2':area2,
                              'huxing':huxing,
                              'wuye_type':wuye_type,
                              'sale_state':sale_state,
                              'price':int(price)},
                            index=[1])   # a custom index of 1; the index could also be left unset

            global fram_all
            # DataFrame.append() was removed in pandas 2.0, so concatenate instead
            fram_all = pd.concat([fram_all, frame_new])
            
                            
                            
        # crawl the next page (recursively)
        result_next_page_str = soup.find_all('a', {'class': 'next-page next-link'})
        if result_next_page_str:
            result_next_page = result_next_page_str[0].attrs['href']
            get_page(result_next_page)
        else:
            print('\n\nNo next page. Job finished!')

        return response.text
    except Exception as e:
        return 'An exception occurred: ' + str(e)
 
 
# replace spaces, newlines and tabs inside the string, and strip surrounding whitespace
def my_strip(s):
    return str(s).replace(" ", "").replace("\n", "").replace("\t", "").replace(" ", "").strip()

# BeautifulSoup is called frequently, so wrap it in a small helper
def my_Beautifulsoup(response):
    return BeautifulSoup(str(response), 'html.parser')
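
# Rough illustration of the two helpers above (hypothetical inputs, shown only as comments):
#   my_strip("  浦东 唐镇 \n\t")             # -> "浦东唐镇"   (all whitespace removed)
#   my_Beautifulsoup("<span>55000</span>")   # -> BeautifulSoup object wrapping the HTML fragment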
 
# crawl an individual listing's detail page (defined here but not called in this script)
def get_page_detail(url):
    response = requests.get(url, headers=header)
#    print("9999999")

    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')

#        plot (parcel) area
        area_all = soup.find(attrs={'id': 'j_id242'})    # note: this element id is page-specific
        area_2 = area_all.find_all('span', {'class': 'layout'})[1]
        global area
        area = area_2.text.lstrip().replace('平方米','',1)   # drop the "square metres" unit
        area = my_strip(area)
        if len(area) == 0:
            area = '0'
#        print('Plot area: ' + area)
  
# =============================================================================
        
print("by 老狼 2019.6.26")
print("QQ: 604631777  微信: cbq8848")

get_page('https://sh.fang.anjuke.com/loupan/all/')  # https://sh.fang.anjuke.com/?from=AF_Home_switchcity

fram_all.index = range(len(fram_all))
fram_all['price'] = fram_all['price'].astype('int')   # make sure price is numeric before aggregating (float32 would also work)
#print(fram_all.dtypes)

fram_all.to_excel("data/lou_detail.xlsx")
print("原始明细数据已经保存为:          data/lou_detail.xlsx")

print("对价格不为0且状态为在售数据进行汇总...")
#原始数据中,排除部分价格为0的数据行,只显示状态为 在售 的行
fr_part = fram_all[~fram_all['price'].isin([0]) & fram_all['sale_state'].isin(['在售'])]


crosstab = pd.pivot_table(fr_part,
                          index=['area1'],
                          values=['price'],
                          aggfunc=['mean', 'max', 'min'],   # string names avoid pandas' warning about np.mean / max / min
                          fill_value=0,
                          margins=False)

crosstab.to_excel("data/lou_hui0.xlsx")
print('Summary by district (area1) saved to:              data/lou_hui0.xlsx')

##------------------------------
# bar chart of the mean price per district
dict_area_price = dict(fr_part.groupby(['area1'])['price'].mean())
#print(dict_area_price)
# sort districts by mean price, descending
list1 = sorted(dict_area_price.items(), key=lambda x: x[1], reverse=True)
#print(list1)
dict_area_price2 = dict(list1)
#print(dict_area_price2)

width = 0.5
idx = range(len(dict_area_price))
xt = list(dict_area_price2.keys())     # convert dict views to lists so matplotlib can handle them
yt = list(dict_area_price2.values())

#G=fr_all['area1'].unique()

plt.bar(idx, yt, width, align='center', color='lightblue')

plt.xticks(idx, xt, rotation=40, fontproperties=myfont)   # myfont renders the Chinese district names
plt.xlabel('区域', fontproperties=myfont)
plt.ylabel('房价平均值', fontproperties=myfont)
fig = plt.gcf()
fig.set_size_inches(15, 6)
plt.show()
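
# Optionally save the chart alongside the Excel output (hypothetical output path):
# fig.savefig('data/area_price_by_district.png', dpi=150, bbox_inches='tight')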
#------------------------

crosstab = pd.pivot_table(fr_part,
                          index=['area1'],
                          columns=['wuye_type'],
                          values=['price'],
                          aggfunc=['mean'],
                          fill_value=0,
                          margins=False)
crosstab.to_excel("data/lou_hui1.xlsx")
print('Summary by district (area1) and property type saved to:      data/lou_hui1.xlsx')

crosstab = pd.pivot_table(fr_part,
                          index=['area1','area2'],
                          columns=['wuye_type'],
                          values=['price'],
                          aggfunc=['mean'],
                          fill_value=0,
                          margins=False)
#print(crosstab)
crosstab.to_excel("data/lou_hui2.xlsx")
print('Summary by area1, area2 and property type saved to:      data/lou_hui2.xlsx')
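
# A minimal sketch for re-loading the exported detail workbook for further analysis.
# It assumes data/lou_detail.xlsx was written successfully by the code above; the
# summary workbooks carry multi-level column headers, so reading them back would
# also need header=[0, 1, ...].
detail = pd.read_excel("data/lou_detail.xlsx", index_col=0)
print(detail.groupby('area1')['price'].describe())   # quick per-district price statistics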

 
