import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
# Step 1: remove outlier samples, reset the index, and separate features from labels.
# Important: remove the outliers BEFORE extracting the labels.
# Reason: with 100 samples, extracting all 100 labels first and only then dropping
# outliers (leaving, say, 90 samples) destroys the one-to-one correspondence
# between the remaining samples and the previously extracted labels.
# Separate the discrete features from the continuous ones.
# Object-dtype columns are discrete; the two Cloud columns are integer-coded
# but categorical in meaning, so they join the discrete list as well.
cate = weather.columns[weather.dtypes == 'object'].tolist()
cate = cate + ['Cloud9am', 'Cloud3pm']
# Everything not in the discrete list is treated as continuous.
sequen = [column for column in weather.columns if column not in cate]
sequen
# Append a Z-score companion column ('<col>_z') for every continuous feature.
for col in sequen:
    values = weather.loc[:, col]
    weather.loc[:, col + '_z'] = (values - values.mean()) / values.std()
weather.head()
# Remove samples whose Z-score exceeds 2 in ANY continuous column.
# BUG FIX: the original recomputed weather_clear from the full `weather`
# frame on every iteration, so only the LAST column's outlier filter ever
# took effect; the per-column drops must accumulate.
# NOTE(review): the cut is one-sided (> 2), matching the stated intent;
# strongly negative outliers are deliberately kept.
weather_clear = weather
for col in sequen:
    outlier_index = weather_clear[weather_clear[col + '_z'] > 2].index
    weather_clear = weather_clear.drop(outlier_index)
weather_clear
# Outlier removal is done; discard the helper Z-score columns so the
# continuous feature list matches the frame again.
for col in sequen:
    weather_clear.drop(columns=col + '_z', inplace=True)
weather_clear
# Rebuild a clean 0..n-1 index now that rows have been dropped.
weather_clear.reset_index(drop=True, inplace=True)
# Encode the label ('Yes' -> 1, 'No' -> 0), split it off, and persist both frames.
# BUG FIX: the original wrote the 'Yes' mask on BOTH lines, so every 'Yes'
# label was set to 1 and then immediately overwritten with 0, while 'No'
# labels were never mapped at all. The second mask must be == 'No'.
weather_clear.loc[weather_clear['RainTomorrow'] == 'Yes', 'RainTomorrow'] = 1
weather_clear.loc[weather_clear['RainTomorrow'] == 'No', 'RainTomorrow'] = 0
target = weather_clear['RainTomorrow']
weather_clear.to_csv('./handled/ML/weather_clear.csv')
target.to_csv('./handled/ML/target.csv')
# Step 2: handle the hard-to-use features.
# Reduce 'Date' ('YYYY-MM-DD') to its month number via an anonymous function,
# rename the column to match, then derive the binary RainToday feature:
# Rainfall of at least 1 mm counts as rain.
weather_clear_cuthard['Date'] = weather_clear_cuthard['Date'].apply(lambda d: int(d.split('-')[1]))
weather_clear_cuthard = weather_clear_cuthard.rename(columns={'Date': 'Month'})
# Two one-sided masks on purpose: rows where Rainfall is NaN match neither
# condition and keep their original RainToday value.
weather_clear_cuthard.loc[weather_clear_cuthard['Rainfall'] >= 1, 'RainToday'] = 1
weather_clear_cuthard.loc[weather_clear_cuthard['Rainfall'] < 1, 'RainToday'] = 0
# Step 3: re-derive the discrete and continuous feature lists.
# Step 2 renamed some features and created new ones, so the lists built
# earlier no longer match the frame and must be rebuilt.
# Discrete = object-dtype columns, plus the integer-coded categoricals
# Month / RainToday / Cloud9am / Cloud3pm; continuous = everything else.
remaining = weather_clear_cuthard.columns.tolist()
cate = weather_clear_cuthard.columns[weather_clear_cuthard.dtypes == 'object'].tolist()
cate.extend(['Month', 'RainToday'])
cate.extend(['Cloud9am', 'Cloud3pm'])
for name in cate:
    remaining.remove(name)
sequen = remaining
# Sanity check: the two lists together must cover every column exactly once.
len(weather_clear_cuthard.columns.tolist()) == len(cate) + len(sequen)
# Step 4: process the continuous features.
# Fill missing values in the continuous columns with each column's mean.
mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
weather_clear_cuthard_fpd.loc[:, sequen] = mean_imputer.fit_transform(
    weather_clear_cuthard_fpd.loc[:, sequen])
# Sanity check: the NaN ratio of every continuous column should now be 0.
weather_clear_cuthard_fpd.loc[:, sequen].isna().mean()
# Standardize the continuous columns to zero mean and unit variance.
scaler = StandardScaler()
weather_clear_cuthard_fpd.loc[:, sequen] = scaler.fit_transform(
    weather_clear_cuthard_fpd.loc[:, sequen])
# Step 5: process the discrete features.
# Fill missing values in the discrete columns with each column's mode.
mode_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
weather_clear_cuthard_fpd.loc[:, cate] = mode_imputer.fit_transform(
    weather_clear_cuthard_fpd.loc[:, cate])
# Sanity check: every discrete column should now be NaN-free.
weather_clear_cuthard_fpd.loc[:, cate].isna().mean()
# One-hot encode the discrete features and splice the result back onto the
# continuous features.
ohe = OneHotEncoder(categories='auto')
onehot_matrix = ohe.fit_transform(weather_clear_cuthard_fpd.loc[:, cate]).toarray()
# FIX: derive the column names from the fitted encoder instead of a
# hand-copied 86-entry literal list. The hard-coded list silently mislabels
# columns whenever the category set shifts (different data slice, an extra
# wind direction, etc.), and the original `ohe.get_feature_names()` call was
# removed in scikit-learn >= 1.2. The `x{i}_{level}` format below reproduces
# the legacy get_feature_names() naming exactly.
onehot_columns = [
    'x{}_{}'.format(i, level)
    for i, levels in enumerate(ohe.categories_)
    for level in levels
]
onehot_matrix = pd.DataFrame(onehot_matrix, columns=onehot_columns)
# Concatenate the one-hot features with the original continuous features.
# (Both frames carry the same 0..n-1 index, so axis=1 aligns row-for-row.)
weather_clear_cuthard_fpd_onehot = pd.concat(
    [weather_clear_cuthard_fpd.loc[:, sequen], onehot_matrix], axis=1)
weather_clear_cuthard_fpd_onehot.head()