python二手房使用教程_python链家二手房爬虫实战

最新推荐文章于 2024-06-10 08:15:00 发布

生命的光彩

最新推荐文章于 2024-06-10 08:15:00 发布

阅读量441

点赞数 1

文章标签： python二手房使用教程

本文链接：https://blog.csdn.net/weixin_29258297/article/details/111917745

版权

本文介绍了如何使用Python进行链家二手房数据的爬取，包括设置请求头、循环抓取页面、解析HTML获取房源总价、信息、关注度和位置等数据，并进行了数据处理，如提取小区、户型、面积、朝向、装修情况、关注数等字段，最后计算了单位价格。

摘要由CSDN通过智能技术生成

import requests

import time

from bs4 import BeautifulSoup

import pandas as pd

import numpy as np

import seaborn as sns

import matplotlib as mpl

import matplotlib.pyplot as plt

from IPython.display import display

# Use matplotlib's built-in "fivethirtyeight" style sheet for all plots.
plt.style.use("fivethirtyeight")

# Put 'simhei' first in the sans-serif font list.
# NOTE(review): presumably so that Chinese labels render correctly - confirm
# that the font is installed on the target machine.
sns.set_style({'font.sans-serif':['simhei','Arial']})

# IPython/Jupyter line magic: this line is only valid inside a notebook or
# IPython session, not in a plain .py script.
%matplotlib inline

from matplotlib.font_manager import FontProperties

# Font handle built from a hard-coded font file path; it is not referenced by
# any of the code visible in this file.
font_zh = FontProperties(fname='/Library/Fonts/SimHei.ttc')

# Fixed part of the listing-page URL.
url = 'http://hz.lianjia.com/ershoufang/'

# Path prefix placed in front of the page number; the page number itself is
# appended by the crawl loop.
page = 'pg'

# HTTP request headers sent with every listing-page request.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 '
        '(KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'
    ),
    'Accept': 'text/html;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'gzip',
    'Connection': 'close',
    'Referer': (
        'http://www.baidu.com/link'
        '?url=_andhfsjjjKRgEWkj7i9cFmYYGsisrnm2A-TN3XZDQXxvGsM9k9ZZSnikW2Yds4s'
        '&wd=&eqid=c3435a7d00006bd600000003582bfd1f'
    ),
}

# Crawl listing pages 1 through 99 and keep the raw response bytes of each.
#
# Every page is fetched the same way, so there is no special case for the
# first page: each body is collected in a list and joined once at the end,
# instead of growing a bytes object by repeated concatenation inside the loop.
page_bodies = []
for page_no in range(1, 100):
    page_url = url + page + str(page_no) + '/'
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url=page_url, headers=headers, timeout=10)
    page_bodies.append(response.content)
    # Pause 0.5 seconds between requests.
    time.sleep(0.5)

# Raw bytes of all fetched pages, in page order.
html = b"".join(page_bodies)

print(html)

# Parse the combined markup of all pages as one document.
lj = BeautifulSoup(html, 'html.parser')

提取相关数据。之前在网上看过教程，但实际爬取时总是报错，原来是链家页面改版了，相应的解析代码也需要随之修改。

# Total price: the `.string` of the first <span> inside each
# <div class="priceInfo">.
price = lj.find_all('div', attrs={'class': 'priceInfo'})
tp = [price_div.span.string for price_div in price]

# House description: full text of each <div class="houseInfo">.
houseInfo = lj.find_all('div', attrs={'class': 'houseInfo'})
hi = [info_div.get_text() for info_div in houseInfo]

# Follow / attention info: full text of each <div class="followInfo">.
followInfo = lj.find_all('div', attrs={'class': 'followInfo'})
fi = [follow_div.get_text() for follow_div in followInfo]

# Location info: full text of each <div class="positionInfo">.
positionInfo = lj.find_all('div', attrs={'class': 'positionInfo'})
pi = [position_div.get_text() for position_div in positionInfo]

提取后进行数据处理

# Import pandas.
import pandas as pd

# Build the raw table: one row per listing, one column per scraped text field.
house = pd.DataFrame({
    'totalprice': tp,
    'houseinfo': hi,
    'followinfo': fi,
    'positioninfo': pi,
})

# Preview the table.
house.head()

# Each derived column is one delimiter-separated field of a raw text column,
# with surrounding whitespace stripped.
# Entries are (new column, source column, separator, field index), listed in
# the order the columns are added.
# NOTE: houseinfo field 5 ('dianti') was disabled in the original code and is
# still not extracted.
# NOTE(review): names are pinyin - xiaoqu=community, huxing=layout,
# mianji=area, chaoxiang=orientation, zhuangxiu=decoration, guanzhu=followers;
# shijian / lougao / weizhi presumably mean time / building height / location
# - confirm against the live page text.
field_specs = (
    ('xiaoqu', 'houseinfo', '|', 0),
    ('huxing', 'houseinfo', '|', 1),
    ('mianji', 'houseinfo', '|', 2),
    ('chaoxiang', 'houseinfo', '|', 3),
    ('zhuangxiu', 'houseinfo', '|', 4),
    ('guanzhu', 'followinfo', '/', 0),
    ('shijian', 'followinfo', '/', 2),
    ('lougao', 'positioninfo', '-', 0),
    ('weizhi', 'positioninfo', '-', 1),
)
for target, source, sep, idx in field_specs:
    house[target] = house[source].apply(
        lambda text, sep=sep, idx=idx: text.split(sep)[idx].strip()
    )

import re

# Digits, optionally followed by a dot and more digits (e.g. "89", "89.5").
# Written as a raw string so "\d" and "\." are regex escapes, not invalid
# Python string escapes.
_NUMBER_RE = re.compile(r"\d+\.?\d*")


def get_num(string):
    """Return the first number found in *string*, as a string.

    Args:
        string: Text to search, e.g. an area or follower-count label.

    Returns:
        The first substring made of digits with an optional decimal part.

    Raises:
        IndexError: If *string* contains no digits.
    """
    match = _NUMBER_RE.search(string)
    if match is None:
        # Same exception type as indexing an empty findall() result, but with
        # a message that names the offending value.
        raise IndexError("no number found in %r" % (string,))
    return match.group()

# Area as a float: take the first number in the 'mianji' text and convert it.
house['mianji_num'] = house['mianji'].apply(get_num).astype(float)

house.head()

# Follower count: first number in the 'guanzhu' text (still a string here).
house['guanzhu_num'] = house['guanzhu'].apply(get_num)

house.head()

# Convert the follower count and the total price to float together.
numeric_cols = ['guanzhu_num', 'totalprice']
house[numeric_cols] = house[numeric_cols].astype(float)

house.head()

# Number of listings per layout ('huxing').
huxing = house.groupby(['huxing'])['huxing'].count()

# Unit price: total price divided by area.
house["danjia"] = house['totalprice'].div(house['mianji_num'])

house.head()

生命的光彩

关注

1
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
python二手房使用教程_python链家二手房爬虫实战

import requests; import time; from bs4 import BeautifulSoup; import pandas as pd; import numpy as np; import seaborn as sns; import matplotlib as mpl; import matplotlib.pyplot as plt; from IPython.display import disp...
复制链接

扫一扫