#!/usr/bin/python3.9
# -*- coding: utf-8 -*-
#
# Copyright (C) 2021 #
# @Time : 2021/4/15 8:51
# @Author : # @Email : # @File : fangke_data_analysis_g1.py
# @Software: PyCharm
"""
成都各区域新开楼盘平均房价
数据源:房客网4.13号数据新开楼盘前100页
分析人 xxx
结论
建议
"""
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
# Font used to render the Chinese chart title and tick labels.
# Written as a raw string so the backslashes in the Windows path are never
# interpreted as (invalid) escape sequences such as "\W", "\F" or "\S".
FONT = FontProperties(fname=r'C:\Windows\Fonts\STXIHEI.TTF')
cd_data = pd.read_csv('../data/original_data/cd/2021-04-13.csv')
# CSV files are normally written with "," as the field delimiter.
# Exploration snippets kept for reference:
# print(cd_data.columns)  # list the column names
# print(cd_data[' adress'])  # district of every listing
# print(cd_data[' adress']==' 龙泉驿')  # boolean mask: True for 龙泉驿, False otherwise
# print(cd_data[cd_data[' adress']==' 龙泉驿'])  # keep only the 龙泉驿 rows
# De-duplication.
# cd_data = cd_data.duplicated().value_counts()  # count duplicated vs. unique rows
cd_data = cd_data.drop_duplicates()
# 数据空值
# cd_data = cd_data.isnull()
# print()
# price = _150
# 异常值处理,因为暂时没法处理,我们仅作记录
# 数据分析
# print(cd_data.columns)
# print(cd_data[' adress'].values) # 获取值,形成列表
# print(cd_data[' adress'].value_counts()) # 获取值,计算值出现次数
# print(type(cd_data[' adress'].value_counts()))
# Prices at or below this threshold are treated as invalid/placeholder
# entries and are left out of the average.
MIN_VALID_PRICE = 1000

# Average price per district, in the same order as
# cd_data[' adress'].value_counts().index (most frequent district first).
mean_price = []
for county in cd_data[' adress'].value_counts().index:
    print(county, end='')
    print("数据如下:")
    # Listings without a published price carry the text '价格待定'
    # ("price to be determined"); map them to '1' so the column parses as
    # numbers and those rows are then dropped by the threshold filter below.
    county_price = pd.to_numeric(
        cd_data[cd_data[' adress'] == county][' price']
        .str.strip()
        .replace('价格待定', '1')
    )
    valid_price = county_price[county_price > MIN_VALID_PRICE]
    # Average over the valid prices only, i.e. divide by their exact count.
    # A district with no valid price reports 0.0 rather than dividing by zero.
    if valid_price.empty:
        mean = 0.0
    else:
        mean = round(float(valid_price.mean()), 2)
    print(mean)
    mean_price.append(mean)
    print("------------------------------------")
# 画图
# Bar chart: one bar per district (x axis) showing its average price.
plt.bar(cd_data[' adress'].value_counts().index, mean_price, width=0.5, color='r')
# Use the lowercase ``fontproperties`` keyword: Matplotlib deprecated
# case-insensitive property names in 3.3 and newer releases reject the
# mixed-case spelling.
plt.title("房客网数据分析\n"
          " 分析人:xxxx", fontproperties=FONT)
# Rotate the district labels so they stay readable when there are many bars.
plt.xticks(fontproperties=FONT, rotation=60, fontsize=9)
plt.show()
# 分析结果如下: