2020年链家网成都二手房销售情况可视化总结

最新推荐文章于 2024-02-05 17:30:00 发布

wenjie1388

最新推荐文章于 2024-02-05 17:30:00 发布

阅读量2.7k

点赞数 2

分类专栏： PYTHON专栏　文章标签： python、可视化、数据可视化

本文链接：https://blog.csdn.net/qq_41935112/article/details/107109633

版权

PYTHON专栏专栏收录该内容

2 篇文章

订阅专栏

功课

Web 框架：

python web微框架——flask，https://dormousehole.readthedocs.io/en/latest/#
python web应用框架——Django

可视化引擎：

蚂蚁金服 AntV 数据可视化团队推出的AntV 地理可视化引擎 L7 ,https://ant.design/index-cn
mapbox 是一个开源的地图引擎，其前端地图框架 mapboxgl 功能强大。前端使用 ant.design 进行布局,https://blog.csdn.net/supermapsupport/article/details/78343391
百度API 3D ,http://lbsyun.baidu.com/solutions/visualization
inMap 丰富的图层、更好的用户体验、大数据地理可视化库。

房价预测系：
7. K-Means聚类地理信息可视化.https://zhuanlan.zhihu.com/p/30138130
8. 《DataFocus 数据可视化》第四章地理信息可视化.https://www.douban.com/note/725396232/
9. 数据挖掘——房价项目预测（四）matplotlib与Seaborn数据可视化学习 .https://blog.csdn.net/weixin_41975471/article/details/106235600. Matplotlib 是一个 Python 的 2D绘图库
10. FineBI数据可视化软件.http://www.fanruansem.com/、http://www.fanruansem.com/finebi
11. OurwayBI.https://tv.sohu.com/v/dXMvMTgxMjczMjU0Lzk0MjcwNjk5LnNodG1s.html

1、分析数据需求

数据来自某网https://cd.lianjia.com/
需要用

地址
总价
售价
小区名称
所在区域
经度
纬度

用地址是因为需要通过地图API（下文代码使用的是高德地图的地理编码接口）得到经纬度做定位，也用来做label;
用总价、售价、小区名称做散点图和label
等等

2、分析网页结构

略

3、爬虫的编写


# coding=UTF-8
import importlib
import json
import requests
import re,sys

from multiprocessing import Process
from fake_useragent import UserAgent
from bs4 import BeautifulSoup

# NOTE(review): reloading ``sys`` looks like a leftover of the Python 2
# "reload(sys); sys.setdefaultencoding(...)" idiom. It is kept so that the
# script's module-level behaviour is unchanged, but it is probably removable.
importlib.reload(sys)

ua = UserAgent()

# District slugs used in the listing URLs; each one is crawled in turn.
title_link = ['wuhou', 'jinniu', 'shuangliu']

# Pattern matching the detail-page URLs inside a listing page. Compiled once
# here instead of on every page of every district.
re_set = re.compile('https://cd.lianjia.com/ershoufang/[0-9]*.?html')

for district in title_link:
    # Listing pages are addressed as .../pg1/ ... /pg100/. The original code
    # appended an extra "1" to every page number (pg01, pg11, pg21, ...), so
    # it skipped most pages and requested pages that do not exist.
    for page in range(1, 101):
        url = 'https://cd.lianjia.com/ershoufang/' + district + '/pg' + str(page) + '/'
        print(district + '的第' + str(page) + '页')
        # A random user agent is picked for every listing page.
        headers = {'Referer': 'https://cd.lianjia.com/ershoufang/', 'user-agent': ua.random}
        res = requests.get(url, headers=headers)

        # Collect all detail-page URLs and drop duplicates while keeping the
        # order in which they appear on the page.
        re_get = re_set.findall(res.text)
        lst2 = dict.fromkeys(re_get)

        for name in lst2:
            res = requests.get(name, headers=headers)
            info = {}
            soup = BeautifulSoup(res.text, 'html.parser')
            # Reset per listing so the error log never reports a geocoding
            # URL left over from a previous listing (or an unbound name when
            # the very first listing fails before the URL is built).
            base = None
            try:
                info['地址'] = soup.select('.main')[0].text
                info['总价【万】'] = soup.select('.total')[0].text
                info['每平方售价【万/平方米】'] = soup.select('.unitPriceValue')[0].text
                info['小区名称'] = soup.select('.info')[0].text
                info['区'] = soup.select('.info a')[0].text + '区'
                info['街道'] = soup.select('.info a')[1].text
                info['房屋朝向'] = soup.select('.type .mainInfo')[0].text
                info['所在楼层'] = soup.select('.content ul li')[1].text
                info['梯户比例'] = soup.select('.content ul li')[9].text
                info['配备电梯'] = soup.select('.content ul li')[10].text
                # Geocode the community name through the AMap REST API.
                mc = soup.select('.info')[0].text
                location1 = '成都' + mc
                # NOTE(review): the API key is hard-coded in the source; it
                # should be loaded from configuration or the environment.
                base = 'https://restapi.amap.com/v3/geocode/geo?key=32420527fb3f52a21761956860a27921&address=' + location1
                response = requests.get(base)
                result = json.loads(response.text)
                info['经纬度'] = result['geocodes'][0]['location']
                # The location is a "longitude,latitude" string. Split on the
                # comma instead of slicing at fixed offsets, which truncated
                # digits and broke whenever the string length differed.
                longitude, latitude = info['经纬度'].split(',')
                info['经度'] = longitude
                info['纬度'] = latitude
                print(info)
            except Exception:
                # Best effort: record the failing URL and move on. The
                # geocoding URL is logged when it was built, otherwise the
                # detail-page URL that could not be parsed.
                with open('/opt/dataFile/error/' + district + '_error.csv', 'a', encoding='utf-8') as error:
                    error.write((base or name) + '\n')
                continue
            with open('/opt/dataFile/data/' + district + '.csv', 'a', encoding='utf-8') as data:
                data.write(str(info) + ',\n')

2020年链家网成都二手房销售部分数据.tar

4、可视化

使用python

import plotly_express as px     # 可视化模块
import plotly.offline           # 生成html文件模块
import numpy                    # 数据格式转换模块
import pandas as pd             # 数据格式转换模块


# plotly_express 2D scatter plot
def plotly_Express_scatter():
    """Build the Gapminder demo scatter plot for the year 2007.

    Plots ``gdpPercap`` (log-scaled x axis) against ``lifeExp``, with marker
    size taken from ``pop`` and colour from ``continent``.

    Returns:
        The figure created by ``px.scatter``. The original version built the
        figure and discarded it, so callers had nothing to show or export.
    """
    gapminder = px.data.gapminder()
    fig = px.scatter(gapminder.query("year==2007"), x="gdpPercap", y="lifeExp",
                     size="pop", color="continent", hover_name="country", log_x=True,
                     size_max=60)
    return fig

# plotly_express 2D scatter plot of the listing data
def plotly_Express_scatter_test(data='E:/Sourese/lianjia/data/list_data.json'):
    """Build a 2D scatter plot from the listing JSON file.

    Args:
        data: Path of the JSON file to load. Defaults to the path that was
            previously hard-coded, so existing calls behave as before.

    Returns:
        The figure created by ``px.scatter``: ``Price`` on a log-scaled x
        axis against ``Acreage``, marker size from ``Tolprice`` and colour
        from ``Area``. The original version discarded the figure; pass the
        result to ``plotly.offline.plot(fig, filename=...)`` to write an
        HTML file.
    """
    # Load the JSON file into a DataFrame.
    df = pd.read_json(data, encoding='UTF-8')
    fig = px.scatter(df, x="Price", y="Acreage", size="Tolprice", color="Area",
                     hover_name="Area", log_x=True, size_max=60)
    return fig


# plotly_express 3D scatter plot demo
def plotly_Express_scatter_3d():
    """Build the election demo 3D scatter plot and append its text form to a file.

    The figure uses the ``Joly``/``Coderre``/``Bergeron`` columns as axes,
    colours by ``winner``, sizes by ``total`` and picks the marker symbol
    from ``result``. ``str(fig)`` is appended to ``test.txt``.
    """
    df = px.data.election()
    fig = px.scatter_3d(df, x="Joly", y="Coderre", z="Bergeron", color="winner", size="total", hover_name="district",
                        symbol="result", color_discrete_map={"Joly": "blue", "Bergeron": "green", "Coderre": "red"})
    # The path used to be 'E:Sourese/...' (no slash after the drive letter),
    # which Windows resolves relative to the current directory of drive E:.
    # Every other path in this file is rooted at 'E:/Sourese/...'.
    with open('E:/Sourese/lianjia/Textdata/test.txt', 'a', encoding='UTF-8') as test:
        # A single string is written with write(); writelines() would iterate
        # over it character by character.
        test.write(str(fig))


def plotly_Express_scatter_3d_test():
    """Render a 3D scatter plot of the listing data with ``plotly.offline.plot``.

    The original body defined ``data`` but never read it; it plotted the
    election demo frame with ``color``/``size``/``symbol``/``hover_name`` all
    set to ``"Area"``, a column this file only uses together with the listing
    data. The listing file is now actually loaded.

    NOTE(review): the column names (``Price``, ``Acreage``, ``Tolprice``,
    ``Area``) are the ones ``plotly_Express_scatter_test`` uses for the same
    file; confirm that this choice of axes is the intended one.
    """
    data = 'E:/Sourese/lianjia/data/list_data.json'
    df = pd.read_json(data, encoding='UTF-8')
    fig = px.scatter_3d(df, x="Price", y="Acreage", z="Tolprice", color="Area", size="Tolprice",
                        hover_name="Area", symbol="Area")
    plotly.offline.plot(fig)


# plotly_express 2D bar chart
def bar_11():
    """Draw a horizontal bar chart of ``Count`` per ``Area`` and write it to HTML.

    Reads ``Sheet2.json`` into a DataFrame, prints it, and plots ``Count`` on
    the x axis against ``Area`` on the y axis with the count shown as the bar
    label. The chart is written to ``bar_11.html`` by ``plotly.offline.plot``.
    """
    # pd.read_json takes a path (or file-like object), not an open() mode.
    # The stray 'r' that used to be passed here landed in the ``orient``
    # parameter and is rejected by recent pandas versions.
    data = pd.read_json("E:/Sourese/lianjia/Textdata/Sheet2.json", encoding='UTF-8')
    print(data)
    fig = px.bar(data, x='Count', y='Area', text='Count', orientation='h', title='dj', template='plotly_white')
    plotly.offline.plot(fig, filename='D:/_workspace/HBuilderX/可视化大作业/file/bar_11.html')

# Script entry point: only the 2D scatter plot of the listing data is run.
# The commented-out calls are the other demos in this file; enable one of
# them to run it instead.
if __name__ == '__main__':
    # bar_11()
    # plotly_Express_scatter()
    plotly_Express_scatter_test()
    # NOTE(review): area_number is not defined anywhere in the visible source.
    # area_number()
    # plotly_Express_scatter_3d()
    # plotly_Express_scatter_3d_test()

Python词云.py

from PIL import Image as image
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
import numpy as np
import jieba


def GetWordCloud():
    """Segment the text in ``data.txt`` with jieba and display its word cloud.

    The text is cut into words and joined with spaces before it is handed to
    ``WordCloud``, which otherwise cannot split Chinese text into words. The
    image is shown with matplotlib, axes hidden.
    """
    path_txt = 'E://Sourese//lianjia//data//data.txt'
    # Read through a context manager so the file handle is always closed;
    # the original ``open(...).read()`` left it open.
    with open(path_txt, 'r', encoding='UTF-8') as source:
        raw_text = source.read()

    # jieba.cut yields the words; join them into one space-separated string.
    cut_text = " ".join(jieba.cut(raw_text))
    wordcloud = WordCloud(
        max_words=120,
        # A font with Chinese glyphs is required, otherwise the words are
        # rendered as empty boxes. Any such font file can be used here.
        font_path="C:/Windows/Fonts/simfang.ttf",
        background_color='white',
        # An image mask (e.g. np.array(image.open(path_img))) can be passed
        # as ``mask=`` to shape the cloud.
    ).generate(cut_text)

    # Show the image.
    plt.imshow(wordcloud.recolor(), interpolation="bilinear")
    plt.axis("off")
    plt.show()


def GetWordCoud2():
    """Build a word cloud with green words from ``wordcloud.txt`` and show it.

    The text is passed to ``WordCloud`` as read (no jieba segmentation), and
    the resulting image is opened with its ``show()`` method.
    """
    with open('E://Sourese//lianjia//data//wordcloud.txt', 'r', encoding='UTF-8') as data:
        text = data.read()

    wordCloud = WordCloud(
        font_path='C:/Windows/Fonts/simfang.ttf',
        background_color='white',
        # ``mode`` is the image mode ("RGB" or "RGBA"), not a colour: the
        # previous value 'green' made to_image() fail. The word colour is
        # set through ``color_func`` instead.
        mode='RGB',
        color_func=lambda *args, **kwargs: 'green',
    ).generate(text)
    image_produce = wordCloud.to_image()
    image_produce.show()


# Script entry point: shows the jieba-segmented word cloud. The commented-out
# call runs the second variant instead.
if __name__ == '__main__':
    GetWordCloud()
    # GetWordCoud2()



"""
WordCloud参数讲解：
            font_path表示用到字体的路径
            width和height表示画布的宽和高
            prefer_horizontal可以调整词云中字体水平和垂直的多少
            mask即掩膜，产生词云背景的区域
            scale:计算和绘图之间的缩放
            min_font_size设置最小的字体大小
            max_words设置显示词语的最大数量
            stopwords设置禁用词
            background_color设置词云的背景颜色
            max_font_size设置字体的最大尺寸
            mode设置图像的颜色模式（默认为'RGB'）；当mode为'RGBA'且background_color为None时背景透明；字体颜色由color_func或colormap设置
            relative_scaling设置有关字体大小的相对字频率的重要性
            regexp设置正则表达式
            collocations 是否包含两个词的搭配
"""