Python学习之数据分析盐城房价

一.知识准备

我们需要学习以下Python库:爬虫获取数据的requests库、读取csv文件的pandas库、解析网页的BeautifulSoup库、保存数据到csv文件的csv库、科学计算库numpy、数据可视化库matplotlib。具体库知识的学习可以在下面评论留言。

import requests
import pandas as pd
from bs4 import BeautifulSoup
import csv
import numpy as np
from matplotlib.pyplot import MultipleLocator
import matplotlib.pyplot as plt

二.知识运用

  • 获取安居库盐城市盐都区房价信息

requests获取网页文本内容,BeautifulSoup定位需要的数据

def get_building_price(url='http://danke00.com/xinfang/1261', timeout=30):
    """Scrape building names and listed prices from a housing listing page.

    Args:
        url: Listing page to fetch. Defaults to the page this script has
            always targeted.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        A ``(build_name_list, build_price_list)`` tuple of two equally long
        lists of strings. Prices are the raw text of the price element and
        may be non-numeric (the cleaning step filters out '待定').

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        IndexError: If the page has no ``<div class="lp-list">`` container.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error page instead of parsing it as a listing.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    containers = soup.find_all('div', {'class': 'lp-list'})
    if not containers:
        # Same exception type the bare ``[0]`` lookup raised, but with a
        # message that says what was actually missing.
        raise IndexError(
            "no <div class='lp-list'> listing container found at %s" % url
        )
    items = containers[0].find_all('li', {'class': 'list-item clearfix'})

    build_name_list = []
    build_price_list = []
    for item in items:
        name_links = item.find_all('a', {'class': 'tit'})
        price_divs = item.find_all('div', {'class': 'other fr'})
        price_ems = (
            price_divs[0].find_all('em', {'class': 'arial'})
            if price_divs else []
        )
        if not name_links or not price_ems:
            # An entry without a title or a price element is skipped as a
            # whole so the two result lists stay index-aligned.
            continue
        # Only the first line of the link text is kept as the name.
        build_name_list.append(name_links[0].text.split('\n')[0])
        build_price_list.append(price_ems[0].text)

    return build_name_list, build_price_list
  • 数据获取后对没有利用价值的数据进行清洗,保存到csv文件
def get_clean_data(build_name_list,build_price_list):
    """Drop unusable price entries and save the rest to a CSV file.

    Names and prices are paired positionally. A pair is kept only when its
    price is not the placeholder '待定' and can be converted to ``int``;
    anything else is skipped instead of aborting the whole run.

    Side effects:
        Prints the full list of (name, price) pairs before filtering, and
        (over)writes ``yancheng_build_price.csv`` in the current working
        directory with a ``BuildName,Price`` header plus one row per kept
        pair.

    Args:
        build_name_list: Building names.
        build_price_list: Raw prices, index-aligned with the names.

    Returns:
        ``(buildName, buildPrice)``: the kept names and their prices as
        ints, in input order.
    """
    # zip() stops at the shorter list, so mismatched lengths cannot raise.
    pairs = list(zip(build_name_list, build_price_list))
    print(pairs)

    header = ['BuildName', 'Price']
    buildName = []
    buildPrice = []
    with open('yancheng_build_price.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        for name, raw_price in pairs:
            if raw_price == '待定':
                continue
            try:
                price = int(raw_price)
            except (TypeError, ValueError):
                # Non-numeric price text other than the known placeholder:
                # skip this row rather than crash with a half-written file.
                continue
            buildName.append(name)
            buildPrice.append(price)
            writer.writerow([name, price])

    return buildName, buildPrice
  • 从csv获取数据,进行数据可视化分析
def main():
    """Scrape the listing, clean and save it, then plot price per building.

    The cleaned data is written to ``yancheng_build_price.csv`` by
    ``get_clean_data`` and read back with pandas for plotting. Each point
    on the line chart is labelled with its price.
    """
    build_name_list, build_price_list = get_building_price()
    buildName, buildPrice = get_clean_data(build_name_list, build_price_list)
    build_name_price_list = pd.read_csv('yancheng_build_price.csv')
    # DataFrame.info() prints its report itself and returns None, so it is
    # not wrapped in print() (that would emit a stray "None" line).
    build_name_price_list.info()

    if not buildName:
        # An empty frame cannot be plotted; stop with a clear message.
        print('No priced buildings found; nothing to plot.')
        return

    build_name_price_list.plot(
        kind='line',
        x='BuildName',
        y='Price',
        title='盐城盐都区各在售小区房价曲线图',
        figsize=(10, 5),
        color='red',
    )
    # One tick per building actually present. A fixed count breaks
    # plt.xticks whenever the number of scraped buildings differs from it.
    positions = np.arange(len(buildName))
    plt.xticks(positions, buildName)
    for a, b in zip(positions, buildPrice):
        # Write the price just above its data point.
        plt.text(a, b, b, ha='center', va='bottom', fontsize=10)

    plt.show()

if __name__ == '__main__':
    main()

三.项目完整代码

"""""
    作者:cpz
    目的:盐城房价分析
    版本:1.0
    时间:30/07/2019
"""""


import requests
import pandas as pd
from bs4 import BeautifulSoup
import csv
import numpy as np
from matplotlib.pyplot import MultipleLocator
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
def get_building_price(url='http://danke00.com/xinfang/1261', timeout=30):
    """Scrape building names and listed prices from a housing listing page.

    Args:
        url: Listing page to fetch. Defaults to the page this script has
            always targeted.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        A ``(build_name_list, build_price_list)`` tuple of two equally long
        lists of strings. Prices are the raw text of the price element and
        may be non-numeric (the cleaning step filters out '待定').

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        IndexError: If the page has no ``<div class="lp-list">`` container.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error page instead of parsing it as a listing.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    containers = soup.find_all('div', {'class': 'lp-list'})
    if not containers:
        # Same exception type the bare ``[0]`` lookup raised, but with a
        # message that says what was actually missing.
        raise IndexError(
            "no <div class='lp-list'> listing container found at %s" % url
        )
    items = containers[0].find_all('li', {'class': 'list-item clearfix'})

    build_name_list = []
    build_price_list = []
    for item in items:
        name_links = item.find_all('a', {'class': 'tit'})
        price_divs = item.find_all('div', {'class': 'other fr'})
        price_ems = (
            price_divs[0].find_all('em', {'class': 'arial'})
            if price_divs else []
        )
        if not name_links or not price_ems:
            # An entry without a title or a price element is skipped as a
            # whole so the two result lists stay index-aligned.
            continue
        # Only the first line of the link text is kept as the name.
        build_name_list.append(name_links[0].text.split('\n')[0])
        build_price_list.append(price_ems[0].text)

    return build_name_list, build_price_list

def get_clean_data(build_name_list,build_price_list):
    """Drop unusable price entries and save the rest to a CSV file.

    Names and prices are paired positionally. A pair is kept only when its
    price is not the placeholder '待定' and can be converted to ``int``;
    anything else is skipped instead of aborting the whole run.

    Side effects:
        Prints the full list of (name, price) pairs before filtering, and
        (over)writes ``yancheng_build_price.csv`` in the current working
        directory with a ``BuildName,Price`` header plus one row per kept
        pair.

    Args:
        build_name_list: Building names.
        build_price_list: Raw prices, index-aligned with the names.

    Returns:
        ``(buildName, buildPrice)``: the kept names and their prices as
        ints, in input order.
    """
    # zip() stops at the shorter list, so mismatched lengths cannot raise.
    pairs = list(zip(build_name_list, build_price_list))
    print(pairs)

    header = ['BuildName', 'Price']
    buildName = []
    buildPrice = []
    with open('yancheng_build_price.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        for name, raw_price in pairs:
            if raw_price == '待定':
                continue
            try:
                price = int(raw_price)
            except (TypeError, ValueError):
                # Non-numeric price text other than the known placeholder:
                # skip this row rather than crash with a half-written file.
                continue
            buildName.append(name)
            buildPrice.append(price)
            writer.writerow([name, price])

    return buildName, buildPrice





def main():
    """Scrape the listing, clean and save it, then plot price per building.

    The cleaned data is written to ``yancheng_build_price.csv`` by
    ``get_clean_data`` and read back with pandas for plotting. Each point
    on the line chart is labelled with its price.
    """
    build_name_list, build_price_list = get_building_price()
    buildName, buildPrice = get_clean_data(build_name_list, build_price_list)
    build_name_price_list = pd.read_csv('yancheng_build_price.csv')
    # DataFrame.info() prints its report itself and returns None, so it is
    # not wrapped in print() (that would emit a stray "None" line).
    build_name_price_list.info()

    if not buildName:
        # An empty frame cannot be plotted; stop with a clear message.
        print('No priced buildings found; nothing to plot.')
        return

    build_name_price_list.plot(
        kind='line',
        x='BuildName',
        y='Price',
        title='盐城盐都区各在售小区房价曲线图',
        figsize=(10, 5),
        color='red',
    )
    # One tick per building actually present. A fixed count breaks
    # plt.xticks whenever the number of scraped buildings differs from it.
    positions = np.arange(len(buildName))
    plt.xticks(positions, buildName)
    for a, b in zip(positions, buildPrice):
        # Write the price just above its data point.
        plt.text(a, b, b, ha='center', va='bottom', fontsize=10)

    plt.show()

if __name__ == '__main__':
    main()

四.知识总结

目前,只是把网站数据爬取过来进行可视化显示,下面进行库的不断学习,争取获取更有利于研究的数据,加油!!!

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值