特征工程

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

第一步:剔除异常样本、重置索引、将特征与标签分离

一定要先剔除异常样本,再提取标签。
原因:假设一共有100个样本,如果先按照最初的样本顺序提取100个标签,然后再剔除异常样本,剔除后还剩下90个非异常样本,那么这90个非异常样本就和刚才的100个标签无法一一对应了。

# Split the columns of `weather` into categorical (`cate`) and continuous
# (`sequen`) feature-name lists.
# Every object-dtype column counts as categorical; Cloud9am and Cloud3pm are
# added to the categorical list explicitly.
cloud = ['Cloud9am', 'Cloud3pm']
cate = weather.columns[weather.dtypes == 'object'].tolist() + cloud
# Start from all column names and strike out the categorical ones; what is
# left is the continuous feature list.
sequen = weather.columns.tolist()
for i in cate:
    sequen.remove(i)
sequen

# For every continuous feature add a companion column '<feature>_z' holding
# the z-score: (value - column mean) / column standard deviation.
for col in sequen:
    z_name = col + '_z'
    centered = weather[col] - weather[col].mean()
    weather.loc[:, z_name] = centered / weather[col].std()
weather.head()

# Remove outlier samples: a row is dropped when ANY continuous feature has a
# z-score greater than 2.
# The mask is built over all z-score columns at once so that the removals
# accumulate; rebuilding the result from the full `weather` frame once per
# column would keep only the last column's filtering.
# Rows whose z-score is NaN compare False and are therefore kept.
# NOTE(review): the test is one-sided (z > 2), so unusually LOW values are
# not treated as outliers - confirm this is intended.
z_columns = [col + '_z' for col in sequen]
outlier_mask = (weather[z_columns] > 2).any(axis=1)
# .copy() gives an independent frame, so later in-place edits of
# `weather_clear` do not touch `weather`.
weather_clear = weather.loc[~outlier_mask].copy()
weather_clear

# Outlier removal is finished: drop the helper '<feature>_z' columns from the
# cleaned frame (the `sequen` name list itself is left unchanged).
weather_clear.drop(columns=[col + '_z' for col in sequen], inplace=True)
weather_clear

# Re-label the rows 0..n-1 so the index is contiguous again.
weather_clear.index = pd.RangeIndex(len(weather_clear))

# Encode the label column in place ('Yes' -> 1, 'No' -> 0), extract it as
# `target`, and write both the cleaned frame and the label to CSV.
# Values other than 'Yes'/'No' (e.g. missing labels) are left untouched.
weather_clear.loc[weather_clear['RainTomorrow'] == 'Yes', 'RainTomorrow'] = 1
# The second mapping must test for 'No'; testing 'Yes' again never matches
# (those cells are already 1) and would leave every 'No' label unencoded.
weather_clear.loc[weather_clear['RainTomorrow'] == 'No', 'RainTomorrow'] = 0
target = weather_clear['RainTomorrow']
# NOTE(review): the saved feature file still contains the 'RainTomorrow'
# column - confirm it is removed before the features are fed to a model.
weather_clear.to_csv('./handled/ML/weather_clear.csv')
target.to_csv('./handled/ML/target.csv')

第二步:处理困难特征

# Replace each 'Date' string by its month number: the second '-'-separated
# field converted to int.
# (assumes dates look like 'YYYY-MM-DD' with no missing values - TODO confirm)
# The column is assigned as a whole rather than through `.loc[:, 'Date'] = ...`:
# with pandas >= 2.0 the `.loc` form writes into the existing object-dtype
# column, so the months would stay object-typed and be picked up again by
# the later `dtypes == 'object'` filter. Whole-column assignment replaces the
# column and yields an integer dtype.
weather_clear_cuthard['Date'] = weather_clear_cuthard['Date'].apply(
    lambda x: int(x.split('-')[1])
)

# The column now holds months, so rename it accordingly.
weather_clear_cuthard = weather_clear_cuthard.rename(columns={'Date': 'Month'})

# Feature creation: derive 'RainToday' from 'Rainfall'
# (1 when Rainfall >= 1, 0 when Rainfall < 1).
# Rows where Rainfall is missing match neither mask and are not assigned.
rainfall = weather_clear_cuthard['Rainfall']
weather_clear_cuthard.loc[rainfall >= 1, 'RainToday'] = 1
weather_clear_cuthard.loc[rainfall < 1, 'RainToday'] = 0

第三步:重新梳理离散、连续特征

经过第二步处理困难特征以后,有些特征名被修改,还有一些新的特征被构建,使得我们不得不重新梳理特征

# Rebuild the categorical (`cate`) and continuous (`sequen`) feature-name
# lists from the current columns of `weather_clear_cuthard`.
total_columns = weather_clear_cuthard.columns.tolist()
# Object-dtype columns are categorical.
# NOTE(review): if the label column is still present with object dtype it is
# picked up here as a feature - confirm it has been removed upstream.
cate = weather_clear_cuthard.columns[weather_clear_cuthard.dtypes == 'object'].tolist()

# These columns are treated as categorical regardless of their dtype.
cloud = ['Cloud9am', 'Cloud3pm']
cate = cate + ['Month', 'RainToday'] + cloud
# Drop duplicates while keeping order: a forced column that is also
# object-typed would otherwise appear twice and make the second
# `remove` below raise ValueError.
cate = list(dict.fromkeys(cate))

# Everything that is not categorical is continuous.
for col in cate:
    total_columns.remove(col)
sequen = total_columns

# Sanity check: the two lists together must cover every column exactly once.
len(weather_clear_cuthard.columns.tolist()) == len(cate) + len(sequen)

第四步:处理连续特征

# Fill missing values of the continuous features with the column mean.
impmean = SimpleImputer(missing_values=np.nan, strategy='mean')
impmean.fit(weather_clear_cuthard_fpd[sequen])
weather_clear_cuthard_fpd.loc[:, sequen] = impmean.transform(weather_clear_cuthard_fpd[sequen])
# Fraction of values still missing per continuous column.
weather_clear_cuthard_fpd[sequen].isna().mean()

# Standardize the continuous features with StandardScaler; the scaler is
# fitted on, and then applied to, the same columns.
ss = StandardScaler()
ss.fit(weather_clear_cuthard_fpd[sequen])
weather_clear_cuthard_fpd.loc[:, sequen] = ss.transform(weather_clear_cuthard_fpd[sequen])

第五步:处理离散特征

# Fill missing values of the categorical features with each column's most
# frequent value.
impmf = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
impmf.fit(weather_clear_cuthard_fpd[cate])
weather_clear_cuthard_fpd.loc[:, cate] = impmf.transform(weather_clear_cuthard_fpd[cate])
# Fraction of values still missing per categorical column.
weather_clear_cuthard_fpd[cate].isna().mean()

# One-hot encode the categorical features and convert the encoder's output
# to a dense array via .toarray().
ohe = OneHotEncoder(categories='auto')
ohe.fit(weather_clear_cuthard_fpd[cate])
onehot_matrix = ohe.transform(weather_clear_cuthard_fpd[cate]).toarray()

# Build the one-hot column names from the fitted encoder instead of pasting
# a fixed list: 'x<i>_<category>', where <i> is the position of the feature
# in the encoder's input and <category> one of its learned categories. This
# is the same naming the pasted list used, but it always matches the
# categories actually present and does not rely on
# OneHotEncoder.get_feature_names(), which was removed in scikit-learn 1.2.
onehot_columns = [
    'x{}_{}'.format(position, category)
    for position, categories in enumerate(ohe.categories_)
    for category in categories
]
onehot_columns

# Wrap the one-hot array in a DataFrame with those column names.
# Reusing the source frame's index keeps the rows aligned when the result is
# later concatenated column-wise with the continuous features.
onehot_matrix = pd.DataFrame(
    onehot_matrix,
    columns=onehot_columns,
    index=weather_clear_cuthard_fpd.index,
)

# Put the continuous features and the one-hot features side by side in one
# frame (column-wise concatenation, rows matched by index label).
continuous_part = weather_clear_cuthard_fpd[sequen]
weather_clear_cuthard_fpd_onehot = pd.concat([continuous_part, onehot_matrix], axis='columns')
weather_clear_cuthard_fpd_onehot.head()
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值