功课
前端框架:
- python web微框架——flask,https://dormousehole.readthedocs.io/en/latest/#
- python web应用框架——Django
可视化引擎:
- 蚂蚁金服 AntV 数据可视化团队推出的AntV 地理可视化引擎 L7 ,https://ant.design/index-cn
- mapbox 是一个开源的地图引擎、 强大的前端地图框架mapboxgl。 前端使用 ant.design 进行布局,https://blog.csdn.net/supermapsupport/article/details/78343391
- 百度API 3D ,http://lbsyun.baidu.com/solutions/visualization
- inMap 丰富的图层、更好的用户体验、大数据地理可视化库。
房价预测系:
7. K-Means聚类地理信息可视化.https://zhuanlan.zhihu.com/p/30138130
8. 《DataFocus 数据可视化》 第四章 地理信息可视化.https://www.douban.com/note/725396232/
9. 数据挖掘——房价项目预测(四)matplotlib与Seaborn数据可视化学习 .https://blog.csdn.net/weixin_41975471/article/details/106235600. Matplotlib 是一个 Python 的 2D绘图库
10. FineBI数据可视化软件.http://www.fanruansem.com/、http://www.fanruansem.com/finebi
11. OurwayBI.https://tv.sohu.com/v/dXMvMTgxMjczMjU0Lzk0MjcwNjk5LnNodG1s.html
1、分析数据需求
数据来自某网https://cd.lianjia.com/
需要用
- 地址
- 总价
- 售价
- 小区名称
- 所在区域
- 经度
- 纬度
用地址是因为需要在高德地图API里得到经纬度做定位,也用来做label;
用总价、售价、小区名称做离散图和label
等等
2、分析网页结构
略
3、爬虫的编写
# coding=UTF-8
import importlib
import json
import requests
import re,sys
from multiprocessing import Process
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
importlib.reload(sys)
ua = UserAgent()
# Crawl second-hand housing listings from cd.lianjia.com, one district at a
# time: walk the paginated listing pages, collect the de-duplicated detail-page
# URLs with a regular expression, scrape each detail page with BeautifulSoup,
# and geocode the community name through the AMap (Gaode) geocoding API.

AMAP_GEOCODE_URL = 'https://restapi.amap.com/v3/geocode/geo'
# NOTE(review): the API key is hard-coded exactly as in the original script;
# it should be loaded from configuration instead of living in the source.
AMAP_KEY = '32420527fb3f52a21761956860a27921'
# Seconds to wait for any single HTTP request before giving up.
REQUEST_TIMEOUT = 10
# Number of listing pages requested per district (pg1 .. pg100).
MAX_PAGES = 100
# Detail-page links look like https://cd.lianjia.com/ershoufang/<digits>.html.
# The dots are escaped so they only match a literal '.'.
DETAIL_URL_RE = re.compile(r'https://cd\.lianjia\.com/ershoufang/[0-9]+\.html')

# District slugs used both in the listing URL and in the output file names.
title_link = ['wuhou', 'jinniu', 'shuangliu']


def _geocode(address):
    """Return the AMap 'location' string for *address*.

    The address is passed through ``params`` so that requests URL-encodes it
    (the scraped text may contain spaces or other reserved characters).

    Raises:
        requests.RequestException: the HTTP request failed or timed out.
        ValueError: the response body is not valid JSON.
        KeyError, IndexError: the response contains no geocoding result.
    """
    response = requests.get(
        AMAP_GEOCODE_URL,
        params={'key': AMAP_KEY, 'address': address},
        timeout=REQUEST_TIMEOUT,
    )
    result = json.loads(response.text)
    return result['geocodes'][0]['location']


def _scrape_detail(detail_url, headers):
    """Fetch one listing detail page and return its fields as a dict.

    The keys are inserted in a fixed order because the caller writes
    ``str(info)`` to the output file.

    Raises:
        requests.RequestException: a page or geocoding request failed.
        IndexError: an expected element is missing from the page.
        KeyError, ValueError: the geocoding response is unusable.
    """
    page = requests.get(detail_url, headers=headers, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(page.text, 'html.parser')

    info = {}
    info['地址'] = soup.select('.main')[0].text
    info['总价【万】'] = soup.select('.total')[0].text
    info['每平方售价【万/平方米】'] = soup.select('.unitPriceValue')[0].text
    info['小区名称'] = soup.select('.info')[0].text
    info_links = soup.select('.info a')
    info['区'] = info_links[0].text + '区'
    info['街道'] = info_links[1].text
    info['房屋朝向'] = soup.select('.type .mainInfo')[0].text
    content_items = soup.select('.content ul li')
    info['所在楼层'] = content_items[1].text
    info['梯户比例'] = content_items[9].text
    info['配备电梯'] = content_items[10].text

    # Geocode "成都" + community name; the location comes back as
    # "<longitude>,<latitude>", so split on the comma instead of relying on
    # fixed-width slices (which truncate digits when the length differs).
    location = _geocode('成都' + info['小区名称'])
    longitude, latitude = location.split(',')
    info['经纬度'] = location
    info['经度'] = longitude
    info['纬度'] = latitude
    return info


for district in title_link:
    for page_no in range(1, MAX_PAGES + 1):
        # Build the listing-page URL for this district and page number.
        url = 'https://cd.lianjia.com/ershoufang/{}/pg{}/'.format(district, page_no)
        print(district + '的第' + str(page_no) + '页')
        # A random user agent is picked once per listing page and reused for
        # the detail pages found on it.
        headers = {'Referer': 'https://cd.lianjia.com/ershoufang/', 'user-agent': ua.random}
        try:
            res = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # Skip this page rather than aborting the whole crawl.
            print('failed to fetch ' + url + ': ' + str(exc))
            continue

        # Extract every detail URL and drop duplicates while keeping order.
        detail_urls = dict.fromkeys(DETAIL_URL_RE.findall(res.text))

        for name in detail_urls:
            try:
                info = _scrape_detail(name, headers)
            except (requests.RequestException, IndexError, KeyError, ValueError):
                # Record the detail URL that failed so it can be retried; this
                # value is always defined, unlike the geocoding URL that the
                # handler used to write.
                with open('/opt/dataFile/error/' + district + '_error.csv', 'a', encoding='utf-8') as error:
                    error.write(name + '\n')
                continue
            print(info)
            with open('/opt/dataFile/data/' + district + '.csv', 'a', encoding='utf-8') as data:
                data.write(str(info) + ',\n')
4、可视化
使用python
import plotly_express as px # 可视化模块
import plotly.offline # 生成html文件模块
import numpy # 数据格式转换模块
import pandas as pd # 数据格式转换模块
# plotly_express 2D scatter plot
def plotly_Express_scatter():
    """Build a bubble chart from the 2007 rows of the bundled Gapminder data.

    x is ``gdpPercap`` (log scale), y is ``lifeExp``, bubble size is ``pop``
    and colour is ``continent``.

    Returns:
        The figure created by ``px.scatter``. It used to be discarded, which
        left the function without any observable result.
    """
    gapminder = px.data.gapminder()
    fig = px.scatter(
        gapminder.query("year==2007"),
        x="gdpPercap",
        y="lifeExp",
        size="pop",
        color="continent",
        hover_name="country",
        log_x=True,
        size_max=60,
    )
    return fig
# plotly_express 2D scatter plot on the scraped listing data
def plotly_Express_scatter_test():
    """Build a scatter plot from ``list_data.json``.

    The JSON file is loaded into a DataFrame and plotted with ``Price`` on a
    log-scaled x axis, ``Acreage`` on y, marker size from ``Tolprice`` and
    colour / hover label from ``Area``.

    Returns:
        The figure created by ``px.scatter`` (previously built and then
        dropped). To export it as HTML, pass it to
        ``plotly.offline.plot(fig, filename=...)``.
    """
    data = 'E:/Sourese/lianjia/data/list_data.json'
    # Load the JSON file into a DataFrame.
    df = pd.read_json(data, encoding='UTF-8')
    fig = px.scatter(
        df,
        x="Price",
        y="Acreage",
        size="Tolprice",
        color="Area",
        hover_name="Area",
        log_x=True,
        size_max=60,
    )
    return fig
# plotly_express 3D scatter plot
def plotly_Express_scatter_3d():
    """Build a 3D scatter of the bundled election data and dump it as text.

    The string form of the figure is appended to ``test.txt``.
    """
    df = px.data.election()
    fig = px.scatter_3d(
        df,
        x="Joly",
        y="Coderre",
        z="Bergeron",
        color="winner",
        size="total",
        hover_name="district",
        symbol="result",
        color_discrete_map={"Joly": "blue", "Bergeron": "green", "Coderre": "red"},
    )
    # The path used to read 'E:Sourese/...' (no slash after the drive letter),
    # which Windows resolves relative to the current directory on drive E.
    # It now matches the absolute 'E:/Sourese/...' form used by the other paths.
    with open('E:/Sourese/lianjia/Textdata/test.txt', 'a', encoding='UTF-8') as test:
        # write() emits the same text as writelines() did, without iterating
        # over the string one character at a time.
        test.write(str(fig))
def plotly_Express_scatter_3d_test():
    """Build a 3D scatter of the bundled election data and open it offline.

    The colour, size, hover and symbol arguments previously all named an
    ``"Area"`` column, while the x/y/z axes and the colour map name
    election-data columns. They now use the same election columns as
    ``plotly_Express_scatter_3d``.

    NOTE(review): the original also assigned the path of ``list_data.json`` to
    an unused local, so the intent may have been to plot the scraped listing
    data instead - confirm which dataset is wanted.
    """
    df = px.data.election()
    fig = px.scatter_3d(
        df,
        x="Joly",
        y="Coderre",
        z="Bergeron",
        color="winner",
        size="total",
        hover_name="district",
        symbol="result",
        color_discrete_map={"Joly": "blue", "Bergeron": "green", "Coderre": "red"},
    )
    plotly.offline.plot(fig)
# plotly_express 2D bar chart
def bar_11():
    """Render ``Sheet2.json`` as a horizontal bar chart and save it as HTML.

    Bars show ``Count`` along x for each ``Area`` on y, labelled with the
    count value.
    """
    # read_json takes a path, not a file mode: the stray 'r' argument was
    # being passed positionally as ``orient`` and has been removed.
    data = pd.read_json("E:/Sourese/lianjia/Textdata/Sheet2.json", encoding='UTF-8')
    print(data)
    fig = px.bar(
        data,
        x='Count',
        y='Area',
        text='Count',
        orientation='h',
        title='dj',
        template='plotly_white',
    )
    plotly.offline.plot(fig, filename='D:/_workspace/HBuilderX/可视化大作业/file/bar_11.html')
# Script entry point: only the listing-data scatter demo is enabled; the other
# demos stay commented out and can be switched on one at a time.
if __name__ == "__main__":
    # bar_11()
    # plotly_Express_scatter()
    plotly_Express_scatter_test()
    # area_number()
    # plotly_Express_scatter_3d()
    # plotly_Express_scatter_3d_test()
Python词云.py
from PIL import Image as image
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
import numpy as np
import jieba
def GetWordCloud():
    """Show a word cloud of ``data.txt`` in a matplotlib window.

    The text is segmented with jieba first; the tokens are joined with spaces
    so that WordCloud can split Chinese text into words.
    """
    path_txt = 'E://Sourese//lianjia//data//data.txt'
    # Read inside a context manager so the file handle is closed right away
    # (it used to be left open).
    with open(path_txt, 'r', encoding='UTF-8') as source:
        raw_text = source.read()

    # jieba.cut yields the segmented words; join them into one
    # space-separated string.
    cut_text = " ".join(jieba.cut(raw_text))

    wordcloud = WordCloud(
        max_words=120,
        # A font with CJK glyphs is required, otherwise the words render as
        # empty boxes; this is a Windows font path and can be swapped.
        font_path="C:/Windows/Fonts/simfang.ttf",
        background_color='white',
        # Optional: pass mask=<image array> to shape the cloud, e.g. from
        # np.array(image.open("E://Sourese//lianjia//data//house.png")).
    ).generate(cut_text)

    # Display the recoloured cloud without axes.
    plt.imshow(wordcloud.recolor(), interpolation="bilinear")
    plt.axis("off")
    plt.show()
def GetWordCoud2():
    """Show a green-on-white word cloud of ``wordcloud.txt`` as an image.

    The text is passed to WordCloud as-is (no jieba segmentation) and the
    result is opened with the default image viewer.
    """
    with open('E://Sourese//lianjia//data//wordcloud.txt', 'r', encoding='UTF-8') as data:
        text = data.read()

    wordCloud = WordCloud(
        font_path='C:/Windows/Fonts/simfang.ttf',
        background_color='white',
        # ``mode`` is the image colour mode ("RGB" or "RGBA"), not a word
        # colour; 'green' is not a valid mode and made to_image() fail.
        mode='RGB',
        # Colour every word green, which is what mode='green' was trying to do.
        color_func=lambda *args, **kwargs: 'green',
        # Optional: pass mask=<image array> to shape the cloud.
    ).generate(text)

    image_produce = wordCloud.to_image()
    image_produce.show()
# Script entry point: run the jieba-based word cloud; the plain variant is
# kept commented out.
if __name__ == "__main__":
    GetWordCloud()
    # GetWordCoud2()
"""
WordCloud参数讲解:
font_path表示用到字体的路径
width和height表示画布的宽和高
prefer_horizontal可以调整词云中字体水平和垂直的多少
mask即掩膜,产生词云背景的区域
scale:计算和绘图之间的缩放
min_font_size设置最小的字体大小
max_words设置显示词语的最大数量
stopwords设置禁用词
background_color设置词云的背景颜色
max_font_size设置字体的最大尺寸
mode设置图像的颜色模式(默认为RGB);设置为RGBA且background_color为None时背景透明
relative_scaling设置有关字体大小的相对字频率的重要性
regexp设置正则表达式
collocations 是否包含两个词的搭配
"""