python二手房数据分析_使用python3抓取链家二手房数据

最新推荐文章于 2024-06-17 18:27:30 发布

溴化银

最新推荐文章于 2024-06-17 18:27:30 发布

阅读量783

点赞数

文章标签： python二手房数据分析

版权声明：本文为博主原创文章，遵循 CC 4.0 BY-SA 版权协议，转载请附上原文出处链接和本声明。

本文链接：https://blog.csdn.net/weixin_42500367/article/details/113653084

版权

import requests

from bs4 import BeautifulSoup

import sys

import os

import time

import pandas as pd

import numpy as np

from parsel import Selector

import re

# Request headers shared by the HTTP calls in this script: they carry a
# desktop-browser User-Agent string (presumably so the site serves its regular
# HTML page -- verify if requests start being rejected).
_BROWSER_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 BIDUBrowser/8.7 Safari/537.36'

headers = {'User-Agent': _BROWSER_USER_AGENT}

def catchHouseList(url):
    """Fetch one listing page and return the house detail URLs found in it.

    Args:
        url: Address of a listing (search result) page.

    Returns:
        A list with every string captured by the link pattern, in the order
        they appear in the page text; an empty list when the response status
        is not 200.
    """
    resp = requests.get(url, headers=headers, stream=True)
    if resp.status_code != 200:
        return []

    # NOTE(review): the pattern in the published listing was destroyed when the
    # page was extracted (its HTML-like text was stripped, leaving an
    # unterminated string literal). The pattern below is a reconstruction that
    # captures the href of the image link inside each <li class="clear"> item;
    # confirm it against the current page markup before relying on it.
    reg = re.compile(r'<li.*?class="clear">.*?<a.*?class="img.*?".*?href="(.*?)"')
    return re.findall(reg, resp.text)

def catchHouseDetail(url):
    """Download one house detail page and extract its fields into a dict.

    Args:
        url: Address of the detail page. The listing id is taken from the
            characters after position 34, so the URL is assumed to start with
            a 34-character prefix such as 'https://sz.lianjia.com/ershoufang/'
            (TODO confirm for other city sub-domains).

    Returns:
        A dict mapping the Chinese field names to plain ``str`` values scraped
        from the page, or ``None`` when the response status is not 200.

    Raises:
        IndexError: If the page lacks one of the elements selected by
            position (for example fewer than three '.content' nodes or fewer
            than twelve '.label' nodes inside the third one).
    """
    resp = requests.get(url, headers=headers)
    print(url)
    if resp.status_code != 200:
        return None

    soup = BeautifulSoup(resp.text, 'html.parser')

    info = {}
    info['标题'] = soup.select('.main')[0].text
    info['总价'] = soup.select('.total')[0].text
    info['总价单位'] = soup.select('.unit')[0].text
    info['每平方售价'] = soup.select('.unitPriceValue')[0].text
    # The original listing had '参考总价' (selector '.tax') commented out; it is
    # still not collected here.
    info['建造时间'] = soup.select('.subInfo')[2].text
    info['小区名称'] = soup.select('.info')[0].text

    areaLinks = soup.select('.info a')
    info['所在区域'] = areaLinks[0].text + ':' + areaLinks[1].text

    # 34 == len('https://sz.lianjia.com/ershoufang/'); what follows is
    # '<id>.html', from which the id is kept.
    info['链家编号'] = str(url)[34:].rsplit('.html')[0]

    # Select the label nodes once instead of re-running the same two selectors
    # for every field. The value of each field is the node that follows its
    # label. Label index 12 ('产权年限') was commented out in the original
    # listing and is still not collected.
    labels = soup.select('.content')[2].select('.label')
    labelFields = (
        '房屋户型',
        '所在楼层',
        '建筑面积',
        '户型结构',
        '套内面积',
        '建筑类型',
        '房屋朝向',
        '建筑结构',
        '装修情况',
        '梯户比例',
        '供暖方式',
        '配备电梯',
    )
    for position, fieldName in enumerate(labelFields):
        # str() turns the BeautifulSoup string object into a plain str (as the
        # original already did for '房屋户型'), so the returned dict does not
        # keep the whole parse tree referenced.
        info[fieldName] = str(labels[position].next_sibling)

    return info

def appendToXlsx(info, fileName='./链家二手房.xlsx'):
    """Append one record as a new row of an Excel workbook.

    The workbook is created when it does not exist yet; otherwise its current
    content is read, the new row is added at the end and the file is rewritten.

    Args:
        info: Dict mapping column names to values for the new row. ``None`` or
            an empty dict is ignored so that no blank junk row is written.
        fileName: Path of the workbook; defaults to the path the script has
            always used.
    """
    if not info:
        return

    dfNew = pd.DataFrame([info])
    if os.path.exists(fileName):
        dfOld = pd.read_excel(fileName)
        # ignore_index renumbers the rows instead of repeating index 0 for
        # every appended record.
        df = pd.concat([dfOld, dfNew], ignore_index=True)
    else:
        df = dfNew

    # index=False: otherwise the row index is stored as an unnamed first
    # column, read back as data ('Unnamed: 0') on the next call, and one more
    # junk column accumulates with every append.
    df.to_excel(fileName, index=False)

def catch():
    """Crawl listing pages pg1..pg1000 and store every house that is found.

    For each listing page under https://sz.lianjia.com/ershoufang/ the detail
    URLs are collected with catchHouseList, each detail page is parsed with
    catchHouseDetail and the resulting record is appended to the workbook with
    appendToXlsx. A house that fails to download, parse or save is reported
    and skipped so that the crawl continues with the next one.
    """
    pages = ['https://sz.lianjia.com/ershoufang/pg{}/'.format(x) for x in range(1, 1001)]
    for page in pages:
        print(page)
        for houseDetailUrl in catchHouseList(page):
            try:
                info = catchHouseDetail(houseDetailUrl)
                # catchHouseDetail yields None for a non-200 response; there is
                # nothing to save in that case.
                if info is not None:
                    appendToXlsx(info)
            except Exception as exc:
                # Best-effort crawl: keep going, but say what was skipped.
                # Catching Exception (not a bare except) lets Ctrl-C and
                # SystemExit stop the run.
                print('skipped {}: {!r}'.format(houseDetailUrl, exc))
            # NOTE(review): the original indentation was lost, so it is unclear
            # whether this pause ran per house or per page; it is applied per
            # detail request here -- confirm.
            time.sleep(3)

# Script entry point: start the crawl only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    catch()

关注

0
点赞
踩
3

收藏

觉得还不错? 一键收藏
0
评论
python二手房数据分析_使用python3抓取链家二手房数据

import requests; from bs4 import BeautifulSoup; import sys; import os; import time; import pandas as pd; import numpy as np; from parsel import Selector; import re; headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10....
复制链接

扫一扫

评论

被折叠的条评论为什么被折叠?

到【灌水乐园】发言

查看更多评论

添加红包

成就一亿技术人!

hope_wisdom

发出的红包

实付元

使用余额支付

点击重新获取

扫码支付

钱包余额 0

抵扣说明：

1.余额是钱包充值的虚拟货币，按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载，可以购买VIP、付费专栏及课程。