2024年最全小牧用Python 爬取数万条房产数据,揭秘一线城市生存压力有多大

for div in divs:
    ps = div.find_all("p")
    # Listings with missing fields, or ad cards injected into the page, lack
    # the expected <p> tags, so any failure while handling one listing skips
    # that listing. `Exception` (not a bare `except`) keeps Ctrl-C working.
    try:
        # Every <p> tag holds one piece of the listing; echo the text so the
        # scraped values can be checked by eye.
        for p in ps:
            print(p.text.strip())
        print("===================================")

        # Parse the listing and store it in MongoDB.
        roomMsg = ps[1].text.split("|")
        # Strip BEFORE cutting the two-character unit suffix. The previous
        # code sliced the stripped text using the length of the unstripped
        # text, so any surrounding whitespace left the unit in place, made
        # float() fail, and silently dropped the listing.
        area_text = roomMsg[2].strip()
        area = area_text[:-2]
        # The last <p> holds the price followed by a three-character suffix.
        price_text = ps[-1].text.strip()
        rentMsg = self.getRentMsg(
            ps[0].text.strip(),
            roomMsg[1].strip(),
            int(float(area)),
            int(price_text[:-3]),
            ps[2].text.strip(),
            ps[3].text.strip(),
            ps[2].text.strip()[:2],
            roomMsg[3],
        )
        # NOTE(review): if `rent` is a PyMongo collection, `insert` was
        # removed in PyMongo 4.0 and `insert_one` is the replacement —
        # confirm the installed driver version before switching.
        rent.insert(rentMsg)
    except Exception:
        continue

数据分析:

求一个区的房租单价(元/平方米)

def getAvgPrice(self, region):
    """Return the rent per unit of area for one district.

    The district's collection is looked up in ``self.zfdb`` under the key
    returned by ``self.getPinyin``. ``price`` and ``area`` are summed in a
    single ``$group`` stage so both totals always come from the same group;
    two independent aggregations could return their groups in different
    orders.

    Args:
        region: District name, forwarded to ``self.getPinyin(region=...)``.

    Returns:
        Summed ``price`` divided by summed ``area`` of the first group.
        (Presumably yuan per square metre per month — confirm against the
        crawler that fills the collection.)

    Raises:
        IndexError: If the aggregation yields no group (empty collection).
        ZeroDivisionError: If the summed area is zero.
    """
    areaPinYin = self.getPinyin(region=region)
    collection = self.zfdb[areaPinYin]
    pipeline = [{
        '$group': {
            '_id': '$region',
            'total_price': {'$sum': '$price'},
            'total_area': {'$sum': '$area'},
        },
    }]
    totals = list(collection.aggregate(pipeline))[0]
    return totals["total_price"] / totals["total_area"]

获取各个区 每个月一平方米需要多少钱

def getTotalAvgPrice(self):
    """Return one ``{"value", "name"}`` entry per district.

    For every district from ``self.getAreaList()`` the value of
    ``self.getAvgPrice`` is rounded to three decimals. ``name`` is the
    district followed by a space and that rounded value, which is the
    shape ``showTreeMap`` is fed with.

    Returns:
        A list of dicts ``{"value": <rounded price>, "name": "<district> <rounded price>"}``
        in the order of ``self.getAreaList()``.
    """
    totalAvgPriceDirList = []
    for region in self.getAreaList():
        # Round once and reuse the result for both fields.
        avgPrice = round(self.getAvgPrice(region), 3)
        totalAvgPriceDirList.append({"value": avgPrice, "name": region + " " + str(avgPrice)})
    return totalAvgPriceDirList

获取各个区 每一天一平方米需要多少钱

def getTotalAvgPricePerDay(self):
    """Return each district's ``getAvgPrice`` value divided by 30.

    The district list is fetched once and reused for both the loop and the
    return value, so the two halves of the result cannot drift apart.

    Returns:
        A tuple ``(districts, prices)`` where ``districts`` is the value of
        ``self.getAreaList()`` and ``prices[i]`` is
        ``round(self.getAvgPrice(districts[i]) / 30, 3)``.
    """
    areaList = self.getAreaList()
    # 30 is the fixed number of days used to turn the monthly figure into a
    # daily one.
    totalAvgPriceList = [round(self.getAvgPrice(region) / 30, 3) for region in areaList]
    return (areaList, totalAvgPriceList)

获取各区统计样本数量

def getAnalycisNum(self):
    """Return the number of stored listings for every district.

    Each district from ``self.getAreaList()`` is mapped through
    ``self.pinyinDir`` to a collection name in ``self.zfdb``, and that
    collection's documents are counted with a ``$group`` / ``$sum: 1``
    aggregation. A collection with no documents yields no group, which is
    reported as 0 rather than raising ``IndexError``.

    Returns:
        A tuple ``(districts, counts)`` with ``counts[i]`` being the number
        of documents in the collection of ``districts[i]``.
    """
    areaList = self.getAreaList()
    analycisList = []
    for region in areaList:
        collection = self.zfdb[self.pinyinDir[region]]
        print(region)
        # NOTE(review): the `_id` value of this stage was lost in the
        # original text; a constant '' (a single group for all documents)
        # is assumed, matching the pipelines in getAcreage — confirm.
        groups = list(collection.aggregate([{'$group': {'_id': '', 'total_num': {'$sum': 1}}}]))
        analycisList.append(groups[0]["total_num"] if groups else 0)
    return (areaList, analycisList)

获取各个区的房源比重

def getAreaWeight(self):
    """Return how many listings each known district has in ``zfdb.rent``.

    Documents are grouped by their ``region`` field and counted. Groups
    whose region is not in ``self.getAreaList()`` are dropped.

    Returns:
        A tuple ``(names, weights)`` of parallel lists: the district names
        and their listing counts, in the order the aggregation returns them.
    """
    result = self.zfdb.rent.aggregate([{'$group': {'_id': '$region', 'weight': {'$sum': 1}}}])
    # Fetch the district list once instead of once per aggregated group.
    knownAreas = self.getAreaList()
    areaName = []
    areaWeight = []
    for item in result:
        if item["_id"] in knownAreas:
            areaWeight.append(item["weight"])
            areaName.append(item["_id"])
            # Progress/debug output kept from the original implementation.
            print(item["_id"])
            print(item["weight"])
            print(type(item))
    return (areaName, areaWeight)

获取 title 数据,用于构建词云

def getTitle(self):
    """Return the concatenated titles of up to 1000 listings.

    Only the ``title`` field is requested from the ``rent`` collection.
    Each title is printed as it is read, and the titles are joined with no
    separator; the result is meant as word-cloud input.

    Returns:
        One string containing every fetched title back to back, or ``''``
        when the query returns nothing.
    """
    collection = self.zfdb["rent"]
    queryArgs = {}
    # The projection dict selects which fields are returned.
    projectionFields = {'_id': False, 'title': True}
    searchRes = collection.find(queryArgs, projection=projectionFields).limit(1000)
    titles = []
    for result in searchRes:
        print(result["title"])
        titles.append(result["title"])
    # A single join avoids the quadratic cost of repeated string `+=`.
    return ''.join(titles)

获取户型数据(例如:3 室 2 厅)

def getRooms(self):
    """Return the listing count for every room layout in ``zfdb.rent``.

    Documents are grouped by their ``rooms`` field (a layout such as
    "3室2厅") and counted.

    Returns:
        A tuple ``(layouts, counts)`` of parallel lists, in the order the
        aggregation returns the groups.
    """
    results = self.zfdb.rent.aggregate([{'$group': {'_id': '$rooms', 'weight': {'$sum': 1}}}])
    roomList = []
    weightList = []
    for result in results:
        roomList.append(result["_id"])
        weightList.append(result["weight"])
        # Debug output kept from the original: prints the keys of the group.
        print(list(result))
    return (roomList, weightList)

获取租房面积

def getAcreage(self):
    """Return how many listings in ``zfdb.rent`` fall into each area range.

    Every range is counted with a ``$match`` on ``area`` (lower bound
    exclusive, upper bound inclusive) followed by a counting ``$group``.

    Two defects of the earlier copy-pasted version are fixed here:

    * the "400+" range matched ``area > 300`` and therefore counted the
      300-400 listings a second time; it now starts at 400;
    * a range with no listings produces no group, which raised
      ``IndexError``; such a range is now reported as 0.

    Returns:
        A tuple ``(attr, value)`` of parallel lists: the range labels and
        the number of listings in each range.
    """
    # (label, exclusive lower bound, inclusive upper bound)
    buckets = [
        ("0-30平方米", 0, 30),
        ("30-60平方米", 30, 60),
        ("60-90平方米", 60, 90),
        ("90-120平方米", 90, 120),
        ("120-200平方米", 120, 200),
        ("200-300平方米", 200, 300),
        ("300-400平方米", 300, 400),
        ("400+平方米", 400, 10000),
    ]
    attr = []
    value = []
    for label, lower, upper in buckets:
        groups = list(self.zfdb.rent.aggregate([
            {'$match': {'area': {'$gt': lower, '$lte': upper}}},
            {'$group': {'_id': '', 'count': {'$sum': 1}}},
        ]))
        attr.append(label)
        value.append(groups[0]["count"] if groups else 0)
    return (attr, value)

数据展示:

展示饼图

def showPie(self, title, attr, value):
    """Render a pie chart with pyecharts.

    Args:
        title: Chart title passed to ``Pie``.
        attr: Slice labels.
        value: Slice values, parallel to ``attr``.

    The chart is written by ``pie.render()`` with no arguments, i.e. to
    pyecharts' default output file.
    """
    # NOTE(review): `from pyecharts import Pie` and `Pie(title)` look like
    # the pyecharts 0.x API — confirm the installed version.
    from pyecharts import Pie

    pie = Pie(title)
    pie.add("aa", attr, value, is_label_show=True)
    pie.render()

展示矩形树图

def showTreeMap(self, title, data):
    """Render a 1200x600 tree map with pyecharts.

    Args:
        title: Chart title passed to ``TreeMap``.
        data: Tree-map data handed unchanged to ``treemap.add``
            (presumably the ``{"value", "name"}`` dicts built by
            ``getTotalAvgPrice`` — confirm against the caller).

    The chart is written by ``treemap.render()`` with no arguments, i.e. to
    pyecharts' default output file.
    """
    # NOTE(review): this import style looks like the pyecharts 0.x API —
    # confirm the installed version.
    from pyecharts import TreeMap

    treemap = TreeMap(title, width=1200, height=600)
    treemap.add("深圳", data, is_label_show=True, label_pos='inside', label_text_size=19)
    treemap.render()

展示条形图

def showLine(self, title, attr, value):
    """Render a bar chart with pyecharts (despite the name, it uses ``Bar``).

    Args:
        title: Chart title passed to ``Bar``.
        attr: Category labels for the x axis.
        value: Bar values, parallel to ``attr``.

    The chart is written by ``bar.render()`` with no arguments, i.e. to
    pyecharts' default output file.
    """
    # NOTE(review): this import style looks like the pyecharts 0.x API —
    # confirm the installed version.
    from pyecharts import Bar

    bar = Bar(title)
    bar.add(
        "深圳", attr, value,
        is_convert=False, is_label_show=True, label_text_size=18, is_random=True,
        xaxis_interval=0, xaxis_label_textsize=9,
        legend_text_size=18, label_text_color=["#000"],
    )
    bar.render()

展示词云

def showWorkCloud(self, content, image_filename, font_filename, out_filename):

d = path.dirname(name)

content = open(path.join(d, filename), ‘rb’).read()

基于 TF-IDF 算法的关键字抽取:topK 为返回权重最高的关键词个数(默认值为 20),withWeight 为是否同时返回关键字的权重

tags = jieba.analyse.extract_tags(content, topK=100, withWeight=False)

text = " ".join(tags)

需要显示的背景图片

img = imread(path.join(d, image_filename))

指定中文字体, 不然会乱码的

wc = WordCloud(font_path=font_filename,

background_color=‘black’,

词云形状,

mask=img,

允许最大词汇

max_words=400,

最后

不知道你们用的什么环境,我一般都是用的 Python 3.6 环境和 PyCharm 编辑器,没有软件,或者没有资料,没人解答问题,都可以免费领取(包括今天的代码),过几天我还会做个视频教程出来,有需要也可以领取~

给大家准备的学习资料包括但不限于:

Python 环境、pycharm编辑器/永久激活/翻译插件

python 零基础视频教程

Python 界面开发实战教程

Python 爬虫实战教程

Python 数据分析实战教程

python 游戏开发实战教程

Python 电子书100本

Python 学习路线规划

网上学习资料一大堆,但如果学到的知识不成体系,遇到问题时只是浅尝辄止,不再深入研究,那么很难做到真正的技术提升。

需要这份系统化学习资料的朋友,可以戳这里无偿获取

一个人可以走得很快,但一群人才能走得更远!不论你是正从事IT行业的老鸟或是对IT行业感兴趣的新人,都欢迎加入我们的圈子(技术交流、学习资源、职场吐槽、大厂内推、面试辅导),让我们一起学习成长!

  • 20
    点赞
  • 10
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值