一、内容来源
数据集分享:链接:https://pan.baidu.com/s/1nU29LEfrILve3-ERqccUTQ
提取码:6ptf
二、学习笔记(广州)
3σ原则(假设数据近似服从正态分布,μ为均值,σ为标准差)为:
数值分布在(μ-σ,μ+σ)中的概率为0.6827
数值分布在(μ-2σ,μ+2σ)中的概率为0.9545
数值分布在(μ-3σ,μ+3σ)中的概率为0.9973
因此落在(μ-3σ,μ+3σ)之外的数值出现概率不足0.3%,通常可视为异常值。
数据处理代码1:找出异常值,并通过线性插值的方式处理掉
import numpy as np
import pandas as pd
# Script 1: flag negative HUMI readings as missing and fill them by linear
# interpolation, then save the result to gz1.csv.

# 1. Load only the columns selected by position (0-5 and 10).
filename = 'GuangzhouPM20100101_20151231.csv'
df = pd.read_csv(filename, encoding='utf-8', usecols=[0, 1, 2, 3, 4, 5, 10])

# 2. Inspect the data.
print('head--------------------------------\n', df.head())
print('deacribe----------------------------\n', df.describe())
# DataFrame.info() writes its report directly and returns None, so it must
# not be passed to print() (that would emit a stray "None").
print('info--------------------------------')
df.info()

# 3. Locate the rows whose HUMI value is below 0.
negative_humi = df["HUMI"] < 0
temp_list = df.index[negative_humi].tolist()
print(temp_list)

# 4. Blank out the negative readings in one vectorized step. The previous
#    per-row chained assignment (df[col][i] = ...) triggers
#    SettingWithCopyWarning and silently does nothing under Copy-on-Write.
df["HUMI_new1"] = df["HUMI"].mask(negative_humi)

# 5. Fill the blanks (and any other missing values) by linear interpolation.
df["HUMI_new2"] = df["HUMI_new1"].interpolate()

# 6. Save the result; the row index is written as the first CSV column.
df.to_csv('gz1.csv')
数据处理代码2:找出小于3σ下边界(μ-3σ)的异常数据,并将其替换为该下边界值
import numpy as np
import pandas as pd
# Script 2: cap HUMI_new2 values that fall below the 3-sigma lower bound
# (mean - 3 * std) and save the result to gz2.csv.

# Load gz1.csv, selecting columns by position (1-6 and 9); position 0 is the
# row index that the previous script wrote out.
filename = 'gz1.csv'
df = pd.read_csv(filename, encoding='utf-8', usecols=[1, 2, 3, 4, 5, 6, 9])

# Inspect the data.
print('-------------------------head--------------------------\n', df.head())
print('------------------------deacribe------------------------\n', df.describe())
# DataFrame.info() writes its report directly and returns None, so it must
# not be passed to print() (that would emit a stray "None").
print('-------------------------info---------------------------')
df.info()

# 3-sigma interval of HUMI_new2.
HUMI_mean = df.HUMI_new2.mean()
HUMI_std = df.HUMI_new2.std()
lower_bound = HUMI_mean - 3 * HUMI_std
print(lower_bound, HUMI_mean + 3 * HUMI_std)

# Rows whose HUMI_new2 lies below the lower bound; the mask is computed once
# and reused for both the index list and the row listing.
below_lower = df["HUMI_new2"] < lower_bound
index_list = df.index[below_lower].tolist()
value_list = df[below_lower]
print("there are {} item:".format(len(index_list)))
print(index_list)
print(value_list)

# Replace those values with the lower bound, truncated to an integer as
# before. A single .loc assignment replaces the per-row chained assignment
# (df[col][i] = ...), which raises SettingWithCopyWarning and has no effect
# under Copy-on-Write.
df["HUMI_new3"] = df["HUMI_new2"].copy()
if index_list:
    df.loc[below_lower, "HUMI_new3"] = int(lower_bound)

# Save the result.
df.to_csv("gz2.csv")
三、作业(北京)
import numpy as np
import pandas as pd
# Homework script: clean the Beijing dataset (interpolate missing weather
# values, cap outliers, replace 'cv' wind directions) and save it to bj.csv.

# Load the data.
filename = 'BeijingPM20100101_20151231.csv'
df = pd.read_csv(filename, encoding='utf-8')

# Inspect the data and count missing values per column.
print('-------------------------head--------------------------\n', df.head())
print('------------------------deacribe------------------------\n',
      df["HUMI"].describe(), df["PRES"].describe(), df["TEMP"].describe())
print('-----------------------缺失值---------------------------\n',
      df.isnull().sum().sort_values(ascending=False))

# Fill missing weather readings by linear interpolation.
for column in ("HUMI", "PRES", "TEMP"):
    df[column] = df[column].interpolate()
print('-----------------------缺失值---------------------------\n',
      df.isnull().sum().sort_values(ascending=False))

# Cap HUMI values above mean + 3 * std at that bound (truncated to an integer,
# as before). Only HUMI is handled here; PRES and TEMP would follow the same
# pattern. A single .loc assignment replaces the per-row chained assignment
# (df[col][i] = ...), which raises SettingWithCopyWarning and has no effect
# under Copy-on-Write.
HUMI_mean = df.HUMI.mean()
HUMI_std = df.HUMI.std()
upper_bound = HUMI_mean + 3 * HUMI_std
above_upper = df["HUMI"] > upper_bound
index_list = df.index[above_upper].tolist()
if index_list:
    df.loc[above_upper, "HUMI"] = int(upper_bound)

# Cap the three PM columns at 500. The before/after summaries now cover all
# three columns (PM_Dongsihuan used to be printed twice and PM_Nongzhanguan
# not at all).
pm_columns = ["PM_Dongsi", "PM_Dongsihuan", "PM_Nongzhanguan"]
print('------------------------deacribe(before)------------------------\n',
      *(df[column].describe() for column in pm_columns))
for column in pm_columns:
    df.loc[df[column] > 500, column] = 500
print('------------------------deacribe(after)------------------------\n',
      *(df[column].describe() for column in pm_columns))

# Replace every 'cv' entry in cbwd with the value of the following row.
# Walking the matches in reverse lets a run of consecutive 'cv' rows inherit
# the first value that comes after the run. An equivalent alternative is to
# walk all rows in reverse and test each cell for 'cv'.
# Row labels are the default 0..n-1 range produced by read_csv, so i + 1 is
# the next row; a 'cv' in the very last row has no successor and is left
# unchanged instead of raising KeyError.
temp_index = df.index[df["cbwd"] == 'cv'].tolist()
row_count = len(df)
for i in reversed(temp_index):
    if i + 1 < row_count:
        df.loc[i, "cbwd"] = df.loc[i + 1, "cbwd"]
print("after", len(df[df.cbwd == 'cv'].index.tolist()))

# Save the result.
df.to_csv("bj.csv")