import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import random
import math
import datetime
'''
查询上海链家二手房信息,根据区域-子区域查询,数量总计10万条
'''
# HTTP request headers sent with every Lianjia request: a desktop-browser
# User-Agent plus a session cookie captured from a real browser session.
# NOTE(review): the hard-coded cookie will expire eventually — refresh it
# from a fresh browser session before re-running the scraper.
headers = {'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv,2.0.1) Gecko/20100101 Firefox/4.0.1",
'cookie': 'lianjia_uuid=2b975ea6-e554-4cbc-9baa-1e8775d04a69; _smt_uid=6624e264.6174c7a9; _ga=GA1.2.1386831987.1713693286; crosSdkDT2019DeviceId=71ix78-68j5ei-w2wgsvlp548xlfh-k93287al4; ftkrc_=0908d48b-cba8-4154-9087-5fa0a3293fee; lfrc_=0958722d-92ef-48a5-a213-0d89a8179118; _ga_NKBFZ7NGRV=GS1.2.1716900315.2.0.1716900315.0.0.0; _ga_XLL3Z3LPTW=GS1.2.1716900316.2.0.1716900316.0.0.0; _ga_E91JCCJY3Z=GS1.2.1716900478.1.0.1716900478.0.0.0; _ga_MFYNHLJT0H=GS1.2.1716900478.1.0.1716900478.0.0.0; _ga_KJTRWRHDL1=GS1.2.1716903734.1.0.1716903734.0.0.0; _ga_QJN1VP0CMS=GS1.2.1716903734.1.0.1716903734.0.0.0; _ga_00MKBBEWEN=GS1.2.1716903774.4.1.1716903809.0.0.0; Hm_lvt_efa595b768cc9dc7d7f9823368e795f1=1717158461; _ga_NKCLMZHBXY=GS1.2.1717981951.2.0.1717981951.0.0.0; _ga_TJZVFLS7KV=GS1.2.1717981921.3.1.1717982369.0.0.0; _ga_WLZSQZX7DE=GS1.2.1717981921.3.1.1717982369.0.0.0; _gid=GA1.2.1548988899.1719497450; select_city=310000; _jzqckmp=1; _jzqc=1; Hm_lvt_9152f8221cb6243a53c83b956842be8a=1719101461,1719497447,1719580936,1719622451; lianjia_ssid=6e8e85bb-8b5f-4497-9a67-ae8235e24db0; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2218f00145a0a1685-0aa86e007e442a-4c657b58-2073600-18f00145a0b1efd%22%2C%22%24device_id%22%3A%2218f00145a0a1685-0aa86e007e442a-4c657b58-2073600-18f00145a0b1efd%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; _jzqa=1.264402294754856700.1713693285.1719644979.1719652495.33; _jzqx=1.1713693285.1719652495.15.jzqsr=cn%2Ebing%2Ecom|jzqct=/.jzqsr=sh%2Elianjia%2Ecom|jzqct=/ershoufang/; _ga_LRLL77SF11=GS1.2.1719652497.31.1.1719652629.0.0.0; _ga_GVYN2J1PCG=GS1.2.1719652497.31.1.1719652629.0.0.0; Hm_lpvt_9152f8221cb6243a53c83b956842be8a=1719654312; _jzqb=1.17.10.1719652495.1'}
# Query the number of listing pages for each Lianjia resale-housing region.
def getRegionPage(url):
    """Collect the result-page count for every sub-district.

    Starting from the ershoufang front page, walk the district links, then
    each district's sub-district links, and read each sub-district's total
    listing count (the site shows 30 listings per page).

    Parameters:
        url: root listing URL ending with '/', e.g.
            'https://sh.lianjia.com/ershoufang/'.

    Returns:
        dict mapping sub-district slug (e.g. 'beicai') -> number of pages.
    """
    pageList = {}
    # Fetch the front page and collect district links.
    # timeout added: requests.get without a timeout can hang indefinitely.
    res = requests.get(url=url, headers=headers, timeout=30)
    soup = BeautifulSoup(res.text, 'lxml')
    regionList = soup.find(attrs={'data-role': 'ershoufang'}).find_all('a')
    for i in regionList:
        # District slug is embedded in the href, e.g. '/pudong/' -> 'pudong'.
        region = i.attrs['href'].replace('/', '').replace('ershoufang', '')
        print('正在查询{}各区域页数...'.format(i.text))
        # Fetch the district page and collect its sub-district links.
        url2 = url + region + '/'
        res = requests.get(url=url2, headers=headers, timeout=30)
        soup = BeautifulSoup(res.text, 'lxml')
        subRegionList = soup.find(attrs={'data-role': 'ershoufang'}).contents[-2].find_all('a')
        for k in subRegionList:
            subRegion = k.attrs['href'].replace('/', '').replace('ershoufang', '')
            url3 = url + subRegion + '/'
            res = requests.get(url=url3, headers=headers, timeout=30)
            soup = BeautifulSoup(res.text, 'lxml')
            # Total listing count displayed on the page; 30 listings per page.
            totalNum = soup.find(attrs={'class': 'leftContent'}).contents[-1].find('p').next_sibling.text
            pageNum = math.ceil(int(totalNum) / 30)
            print('{}-{}共有{}页数据!'.format(i.text, k.text, pageNum))
            pageList[subRegion] = pageNum
            # Random 0-3 s pause to avoid tripping the site's anti-scraping.
            time.sleep(random.random() * 3)
    return pageList
# Query one page of data (house listings).
def getDatas(url, region):
    """Scrape one result page and return its listing records.

    Parameters:
        url: page URL, e.g. 'https://sh.lianjia.com/ershoufang/beicai/pg2/'.
        region: sub-district slug; kept for interface compatibility with the
            caller, not used during extraction.

    Returns:
        list of dicts, one per listing, keyed (in Chinese) by url, title,
        estate, area, house info, tags, total price and unit price.
    """
    dataList = []
    # timeout added: requests.get without a timeout can hang indefinitely.
    res = requests.get(url=url, headers=headers, timeout=30)
    soup = BeautifulSoup(res.text, 'lxml')
    div = soup.find_all(attrs={'class': 'info clear'})
    # Extract each field from every listing card.
    for k in div:
        result = {}
        result['url'] = k.find(attrs={'class': 'title'}).find('a').attrs['href']
        result['标题'] = k.find(attrs={'class': 'title'}).text
        result['小区'] = k.find(attrs={'class': 'positionInfo'}).find_all('a')[0].text
        result['区域'] = k.find(attrs={'class': 'positionInfo'}).find_all('a')[1].text
        result['房屋信息'] = k.find(attrs={'class': 'houseInfo'}).text
        # Join tag texts with commas; str.join fixes the stray leading comma
        # the previous '+=' concatenation produced.
        tag = k.find(attrs={'class': 'tag'}).find_all('span')
        result['标签'] = ','.join(span.text for span in tag)
        result['总价'] = k.find(attrs={'class': 'totalPrice totalPrice2'}).text
        result['单价'] = k.find(attrs={'class': 'unitPrice'}).text
        dataList.append(result)
    print('{}提取成功!'.format(url))
    # Random 0-3 s pause between page fetches to reduce blocking risk.
    time.sleep(random.random() * 3)
    return dataList
# Deduplicate the file and split the house-info column into separate columns.
def dataAdjust(path):
    """Clean the scraped CSV in place.

    Drops duplicate listings (same detail-page URL), splits the combined
    '房屋信息' column into individual columns, repairs rows where a missing
    build year shifted the building type into the wrong column, and
    rewrites the CSV at *path*.

    Parameters:
        path: path of the CSV produced by the scraper; overwritten.
    """
    df = pd.read_csv(path)
    # Deduplicate by detail-page URL, keeping the first occurrence.
    df = df.drop_duplicates(subset=['url'], keep='first')
    # Split '房屋信息' ("a | b | ...") into one column per segment; the last
    # split fragment is discarded (assumes 8 '|'-separated segments per
    # row — TODO confirm against live page markup).
    df[['房型', '面积', '朝向', '装修', '楼层', '建筑年代', '类型']] = \
        df['房屋信息'].str.split('|', expand=True).iloc[:, :-1]
    df.drop(axis=1, columns='房屋信息', inplace=True)
    # Rows without a build year have the building type sitting in the
    # '建筑年代' slot; move it across and mark the year as unavailable.
    # na=True keeps NaN rows excluded, matching the original `== False` test.
    condition = (~df['建筑年代'].str.contains('年', na=True)
                 & ~df['建筑年代'].str.contains('暂无数据', na=True))
    # Single-step .loc read/write instead of chained indexing
    # (df.loc[cond]['col']), which pandas warns against.
    df.loc[condition, '类型'] = df.loc[condition, '建筑年代']
    df.loc[condition, '建筑年代'] = '暂无数据'
    # Write the cleaned data back over the original file.
    df.to_csv(path, index=False)
if __name__ == '__main__':
    # Record when the run started so total runtime can be reported.
    start_time = datetime.datetime.now()
    # Create a fresh, timestamped output file containing only the header row.
    columns = ['url', '标题', '小区', '区域', '房屋信息', '标签', '总价', '单价']
    path = 'data_lianjia_' + datetime.datetime.now().strftime('%m%d%H%M') + '.csv'
    pd.DataFrame(data=None, columns=columns).to_csv(path, index=False)
    # Discover how many result pages each sub-district has.
    url = 'https://sh.lianjia.com/ershoufang/'
    page_counts = getRegionPage(url)
    # Scrape every page of every sub-district, appending rows as we go.
    for sub_region, pages in page_counts.items():
        # The site serves at most 100 result pages per query, so cap there.
        pages = min(pages, 100)
        for page_no in range(1, pages + 1):
            page_url = 'https://sh.lianjia.com/ershoufang/%s/pg%s/' % (sub_region, page_no)
            rows = getDatas(page_url, sub_region)
            # Append this page's rows to the CSV (no header on append).
            pd.DataFrame(rows).to_csv(path, mode='a', header=False, index=False)
    # Deduplicate and split the combined house-info column.
    dataAdjust(path)
    # Report total elapsed runtime.
    end_time = datetime.datetime.now()
    print('程序运行时间:{}'.format(end_time - start_time))
# (removed: trailing date/view-count residue from the webpage this script was
# extracted from — not part of the program and it broke the Python syntax)