for div in divs:
    ps = div.find_all("p")
    # Some listings are incomplete, or an advertisement was inserted in their
    # place; those lack the expected tags and fail while being parsed. Such
    # entries are skipped. ``Exception`` (rather than a bare ``except``) keeps
    # KeyboardInterrupt/SystemExit working so the crawl can still be stopped.
    try:
        # Every <p> tag carries one piece of the listing; print them to check
        # that this is the information we want.
        for p in ps:
            print(p.text.strip())
        print("===================================")

        # Parse the listing and store it in the MongoDB database.
        roomMsg = ps[1].text.split("|")
        # Drop the trailing two-character unit from the area field. The slice
        # is taken on the stripped text so surrounding whitespace cannot leave
        # the unit in place.
        area = roomMsg[2].strip()[:-2]
        # The last <p> ends with a three-character suffix that is dropped
        # before the remainder is converted to an integer.
        lastText = ps[-1].text.strip()
        # The record is built through getRentMsg because some fields may be
        # missing on a listing, which would otherwise leave the object empty.
        rentMsg = self.getRentMsg(
            ps[0].text.strip(),
            roomMsg[1].strip(),
            int(float(area)),
            int(lastText[:-3]),
            ps[2].text.strip(),
            ps[3].text.strip(),
            ps[2].text.strip()[:2],
            roomMsg[3],
        )
        # NOTE(review): `insert` is the legacy PyMongo call (presumably `rent`
        # is a collection); confirm the installed PyMongo still provides it.
        rent.insert(rentMsg)
    except Exception:
        continue
# 数据分析:
# 求一个区的房租单价(元/平方米)
def getAvgPrice(self, region):
    """Return the rent per unit of area for one district.

    The district name is mapped to a collection name with ``self.getPinyin``.
    The ``price`` and ``area`` fields of that collection are summed per
    ``region`` value and the quotient of the two sums for the first group is
    returned.

    Both sums are computed in a single ``$group`` stage so that they always
    describe the same group; two independent aggregations indexed with
    ``[0]`` could pick different groups.

    Args:
        region: District name accepted by ``self.getPinyin``.

    Returns:
        Total price divided by total area.

    Raises:
        IndexError: If the aggregation yields no documents.
        ZeroDivisionError: If the summed area is zero.
    """
    areaPinYin = self.getPinyin(region=region)
    collection = self.zfdb[areaPinYin]
    pipeline = [{
        '$group': {
            '_id': '$region',
            'total_price': {'$sum': '$price'},
            'total_area': {'$sum': '$area'},
        },
    }]
    totals = list(collection.aggregate(pipeline))[0]
    return totals["total_price"] / totals["total_area"]
# 获取各个区 每个月一平方米需要多少钱
def getTotalAvgPrice(self):
    """Return the average unit price of every district as chart entries.

    For each district from ``self.getAreaList()`` the value of
    ``self.getAvgPrice`` is rounded to three decimals.

    Returns:
        A list of dicts ``{"value": price, "name": "<district> <price>"}``,
        one per district, in the order of ``self.getAreaList()``.
    """
    totalAvgPriceDirList = []
    for region in self.getAreaList():
        avgPrice = round(self.getAvgPrice(region), 3)
        totalAvgPriceDirList.append({"value": avgPrice, "name": region + " " + str(avgPrice)})
    return totalAvgPriceDirList
# 获取各个区 每一天一平方米需要多少钱
def getTotalAvgPricePerDay(self):
    """Return the per-day unit price of every district.

    The value of ``self.getAvgPrice`` for each district is divided by 30 and
    rounded to three decimals.

    Returns:
        A tuple ``(districts, prices)`` of two parallel lists, where
        ``districts`` is the result of ``self.getAreaList()``.
    """
    # Fetch the district list once; it is both iterated and returned.
    areaList = self.getAreaList()
    totalAvgPriceList = [round(self.getAvgPrice(region) / 30, 3) for region in areaList]
    return (areaList, totalAvgPriceList)
# 获取各区统计样本数量
def getAnalycisNum(self):
    """Return the number of stored documents for every district.

    Each district from ``self.getAreaList()`` is mapped to its collection
    through ``self.pinyinDir`` and the documents in that collection are
    counted with a ``$group``/``$sum`` aggregation.

    Returns:
        A tuple ``(districts, counts)`` of two parallel lists. A collection
        for which the aggregation yields nothing is reported as ``0``
        instead of raising ``IndexError``.
    """
    areaList = self.getAreaList()
    analycisList = []
    for region in areaList:
        collection = self.zfdb[self.pinyinDir[region]]
        print(region)
        totalNum = collection.aggregate([{'$group': {'_id': '', 'total_num': {'$sum': 1}}}])
        # An empty collection produces no group document at all.
        first = next(iter(totalNum), None)
        analycisList.append(first["total_num"] if first is not None else 0)
    return (areaList, analycisList)
# 获取各个区的房源比重
def getAreaWeight(self):
    """Return how many documents of the ``rent`` collection fall in each district.

    Documents are grouped by their ``region`` field and counted. Groups whose
    key is not contained in ``self.getAreaList()`` are ignored.

    Returns:
        A tuple ``(names, weights)`` of two parallel lists.
    """
    result = self.zfdb.rent.aggregate([{'$group': {'_id': '$region', 'weight': {'$sum': 1}}}])
    # The district list does not depend on the loop; fetch it once.
    knownAreas = self.getAreaList()
    areaName = []
    areaWeight = []
    for item in result:
        if item["_id"] in knownAreas:
            areaWeight.append(item["weight"])
            areaName.append(item["_id"])
            print(item["_id"])
            print(item["weight"])
            print(type(item))
    return (areaName, areaWeight)
# 获取 title 数据,用于构建词云
def getTitle(self):
    """Return the concatenated titles of up to 1000 ``rent`` documents.

    Only the ``title`` field is requested from the collection. Each title is
    printed as it is read.

    Returns:
        All titles joined into one string with no separator.
    """
    collection = self.zfdb["rent"]
    queryArgs = {}
    # A dict selects the fields to return.
    projectionFields = {'_id': False, 'title': True}
    searchRes = collection.find(queryArgs, projection=projectionFields).limit(1000)
    titles = []
    for result in searchRes:
        print(result["title"])
        titles.append(result["title"])
    return ''.join(titles)
# 获取户型数据(例如:3 室 2 厅)
def getRooms(self):
    """Return how many ``rent`` documents exist for each ``rooms`` value.

    Documents are grouped by their ``rooms`` field and counted. The keys of
    every group document are printed while iterating.

    Returns:
        A tuple ``(rooms, weights)`` of two parallel lists.
    """
    results = self.zfdb.rent.aggregate([{'$group': {'_id': '$rooms', 'weight': {'$sum': 1}}}])
    roomList = []
    weightList = []
    for result in results:
        roomList.append(result["_id"])
        weightList.append(result["weight"])
        print(list(result))
    return (roomList, weightList)
# 获取租房面积
def getAcreage(self):
    """Return how many ``rent`` documents fall into each area bucket.

    Every bucket is counted with a ``$match`` on ``area`` (lower bound
    exclusive, upper bound inclusive) followed by a ``$group``/``$sum``.

    The last bucket starts at 400; starting it at 300 would count the
    300-400 listings twice. A bucket that matches no document is reported as
    ``0`` rather than raising ``IndexError`` on the empty result.

    Returns:
        A tuple ``(labels, counts)`` of two parallel lists, one entry per
        bucket in ascending order of area.
    """
    # (label, exclusive lower bound, inclusive upper bound)
    buckets = [
        ("0-30平方米", 0, 30),
        ("30-60平方米", 30, 60),
        ("60-90平方米", 60, 90),
        ("90-120平方米", 90, 120),
        ("120-200平方米", 120, 200),
        ("200-300平方米", 200, 300),
        ("300-400平方米", 300, 400),
        ("400+平方米", 400, 10000),
    ]
    attr = []
    value = []
    for label, low, high in buckets:
        cursor = self.zfdb.rent.aggregate([
            {'$match': {'area': {'$gt': low, '$lte': high}}},
            {'$group': {'_id': '', 'count': {'$sum': 1}}},
        ])
        # No matching document means no group document is produced at all.
        first = next(iter(cursor), None)
        attr.append(label)
        value.append(first["count"] if first is not None else 0)
    return (attr, value)
# 数据展示:
# 展示饼图
def showPie(self, title, attr, value):
    """Render a pie chart with pyecharts.

    Args:
        title: Title passed to the ``Pie`` constructor.
        attr: Labels passed to ``Pie.add``.
        value: Values passed to ``Pie.add``, parallel to ``attr``.
    """
    from pyecharts import Pie
    pie = Pie(title)
    pie.add("aa", attr, value, is_label_show=True)
    # render() is called without arguments, so pyecharts chooses the output.
    pie.render()
# 展示矩形树图
def showTreeMap(self, title, data):
    """Render a tree map with pyecharts.

    Args:
        title: Title passed to the ``TreeMap`` constructor.
        data: Data passed to ``TreeMap.add``.
    """
    from pyecharts import TreeMap
    treemap = TreeMap(title, width=1200, height=600)
    treemap.add("深圳", data, is_label_show=True, label_pos='inside', label_text_size=19)
    treemap.render()
# 展示条形图
def showLine(self, title, attr, value):
    """Render a bar chart with pyecharts.

    Despite its name, this method builds a ``Bar`` chart.

    Args:
        title: Title passed to the ``Bar`` constructor.
        attr: Labels passed to ``Bar.add``.
        value: Values passed to ``Bar.add``, parallel to ``attr``.
    """
    from pyecharts import Bar
    bar = Bar(title)
    bar.add(
        "深圳", attr, value,
        is_convert=False, is_label_show=True, label_text_size=18, is_random=True,
        xaxis_interval=0, xaxis_label_textsize=9,
        legend_text_size=18, label_text_color=["#000"],
    )
    bar.render()
# 展示词云
def showWorkCloud(self, content, image_filename, font_filename, out_filename):
d = path.dirname(name)
content = open(path.join(d, filename), ‘rb’).read()
基于TF-IDF算法的关键字抽取, topK返回频率最高的几项, 默认值为20, withWeight
为是否返回关键字的权重
tags = jieba.analyse.extract_tags(content, topK=100, withWeight=False)
text = " ".join(tags)
需要显示的背景图片
img = imread(path.join(d, image_filename))
指定中文字体, 不然会乱码的
wc = WordCloud(font_path=font_filename,
background_color=‘black’,
词云形状,
mask=img,
允许最大词汇
max_words=400,
最后
不知道你们用的什么环境,我一般都是用 Python 3.6 环境和 PyCharm 编辑器。没有软件、没有资料,或者没人解答问题,都可以免费领取(包括今天的代码),过几天我还会做个视频教程出来,有需要也可以领取~
给大家准备的学习资料包括但不限于:
Python 环境、pycharm编辑器/永久激活/翻译插件
python 零基础视频教程
Python 界面开发实战教程
Python 爬虫实战教程
Python 数据分析实战教程
python 游戏开发实战教程
Python 电子书100本
Python 学习路线规划
网上学习资料一大堆,但如果学到的知识不成体系,遇到问题时只是浅尝辄止,不再深入研究,那么很难做到真正的技术提升。
一个人可以走得很快,但一群人才能走得更远!不论你是正从事IT行业的老鸟或是对IT行业感兴趣的新人,都欢迎加入我们的圈子(技术交流、学习资源、职场吐槽、大厂内推、面试辅导),让我们一起学习成长!