Feature Engineering
from IPython.display import Image
Image(filename=r'C:.../.../...特征工程.png', width=600)
out
Business modeling workflow:
1) Abstract the business problem as a classification or regression problem
2) Define the label and obtain y
3) Select appropriate samples and join all available information as the source of features
4) Feature engineering + model training + model evaluation and tuning (these steps may interact; see the sketch after this list)
5) Produce a model report
6) Deploy and monitor
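Steps 4) and 5) map naturally onto scikit-learn's pipeline utilities. A minimal sketch with synthetic data; the estimator choice and the placeholder X/y are illustrative, not part of these notes:
# A minimal sketch of steps 4)-5): transformation + training + evaluation
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

X = np.random.rand(200, 3)        # placeholder feature matrix
y = np.random.randint(0, 2, 200)  # placeholder binary label
pipe = Pipeline([
    ('scale', StandardScaler()),    # feature transformation
    ('clf', LogisticRegression()),  # model training
])
scores = cross_val_score(pipe, X, y, cv=5, scoring='roc_auc')  # evaluation
print(scores.mean())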
What is a feature?
In a machine learning context, a feature is an individual attribute, or a group of attributes, that helps explain the phenomenon being modeled, made quantifiable and measurable.
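For example, a raw categorical attribute only becomes usable once it is made measurable; a tiny sketch with illustrative values:
# Hypothetical example: map a categorical attribute to a measurable 0/1 feature
import pandas as pd
pd.Series(['male', 'female', 'female']).map({'male': 0, 'female': 1})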
What feature engineering covers (this is not the only breakdown, and the order is not fixed):
1) Basic feature construction
2) Data preprocessing
3) Feature derivation
4) Feature transformation
5) Feature selection
Preview the data
import pandas as pd
import numpy as np
df_train = pd.read_csv('train.csv')
df_train.head()
df_train.shape
out
(891, 12)
df_train.info()
out
df_train.describe()
out
# Percentiles and outliers of a variable
# Strictly speaking this analysis is optional; filtering from the final model's perspective greatly reduces the workload
%matplotlib inline
df_train.boxplot(column='Age')
import seaborn as sns
sns.set(color_codes=True)
np.random.seed(sum(map(ord, "distributions")))
sns.distplot(df_train['Age'].dropna(), kde=True, bins=20, rug=True)
# Check how many classes y has
df_train.label.unique()
from pyecharts import Bar3D
bar3d = Bar3D("Distribution of applicants in 2018", width=1200, height=600)
x_axis = [
"12a", "1a", "2a", "3a", "4a", "5a", "6a", "7a", "8a", "9a", "10a", "11a",
"12p", "1p", "2p", "3p", "4p", "5p", "6p", "7p", "8p", "9p", "10p", "11p"
]
y_axis = [
"Saturday", "Friday", "Thursday", "Wednesday", "Tuesday", "Monday", "Sunday"
]
data = [
[0, 0, 5], [0, 1, 1], [0, 2, 0], [0, 3, 0], [0, 4, 0], [0, 5, 0],
[0, 6, 0], [0, 7, 0], [0, 8, 0], [0, 9, 0], [0, 10, 0], [0, 11, 2],
[0, 12, 4], [0, 13, 1], [0, 14, 1], [0, 15, 3], [0, 16, 4], [0, 17, 6],
[0, 18, 4], [0, 19, 4], [0, 20, 3], [0, 21, 3], [0, 22, 2], [0, 23, 5],
[1, 0, 7], [1, 1, 0], [1, 2, 0], [1, 3, 0], [1, 4, 0], [1, 5, 0],
[1, 6, 0], [1, 7, 0], [1, 8, 0], [1, 9, 0], [1, 10, 5], [1, 11, 2],
[1, 12, 2], [1, 13, 6], [1, 14, 9], [1, 15, 11], [1, 16, 6], [1, 17, 7],
[1, 18, 8], [1, 19, 12], [1, 20, 5], [1, 21, 5], [1, 22, 7], [1, 23, 2],
[2, 0, 1], [2, 1, 1], [2, 2, 0], [2, 3, 0], [2, 4, 0], [2, 5, 0],
[2, 6, 0], [2, 7, 0], [2, 8, 0], [2, 9, 0], [2, 10, 3], [2, 11, 2],
[2, 12, 1], [2, 13, 9], [2, 14, 8], [2, 15, 10], [2, 16, 6], [2, 17, 5],
[2, 18, 5], [2, 19, 5], [2, 20, 7], [2, 21, 4], [2, 22, 2], [2, 23, 4],
[3, 0, 7], [3, 1, 3], [3, 2, 0], [3, 3, 0], [3, 4, 0], [3, 5, 0],
[3, 6, 0], [3, 7, 0], [3, 8, 1], [3, 9, 0], [3, 10, 5], [3, 11, 4],
[3, 12, 7], [3, 13, 14], [3, 14, 13], [3, 15, 12], [3, 16, 9], [3, 17, 5],
[3, 18, 5], [3, 19, 10], [3, 20, 6], [3, 21, 4], [3, 22, 4], [3, 23, 1],
[4, 0, 1], [4, 1, 3], [4, 2, 0], [4, 3, 0], [4, 4, 0], [4, 5, 1],
[4, 6, 0], [4, 7, 0], [4, 8, 0], [4, 9, 2], [4, 10, 4], [4, 11, 4],
[4, 12, 2], [4, 13, 4], [4, 14, 4], [4, 15, 14], [4, 16, 12], [4, 17, 1],
[4, 18, 8], [4, 19, 5], [4, 20, 3], [4, 21, 7], [4, 22, 3], [4, 23, 0],
[5, 0, 2], [5, 1, 1], [5, 2, 0], [5, 3, 3], [5, 4, 0], [5, 5, 0],
[5, 6, 0], [5, 7, 0], [5, 8, 2], [5, 9, 0], [5, 10, 4], [5, 11, 1],
[5, 12, 5], [5, 13, 10], [5, 14, 5], [5, 15, 7], [5, 16, 11], [5, 17, 6],
[5, 18, 0], [5, 19, 5], [5, 20, 3], [5, 21, 4], [5, 22, 2], [5, 23, 0],
[6, 0, 1], [6, 1, 0], [6, 2, 0], [6, 3, 0], [6, 4, 0], [6, 5, 0],
[6, 6, 0], [6, 7, 0], [6, 8, 0], [6, 9, 0], [6, 10, 1], [6, 11, 0],
[6, 12, 2], [6, 13, 1], [6, 14, 3], [6, 15, 4], [6, 16, 0], [6, 17, 0],
[6, 18, 0], [6, 19, 0], [6, 20, 1], [6, 21, 2], [6, 22, 2], [6, 23, 6]
]
range_color = ['#313695', '#4575b4', '#74add1', '#abd9e9', '#e0f3f8', '#ffffbf',
'#fee090', '#fdae61', '#f46d43', '#d73027', '#a50026']
bar3d.add(
"",
x_axis,
y_axis,
[[d[1], d[0], d[2]] for d in data],
is_visualmap=True,
visual_range=[0, 20],
visual_range_color=range_color,
grid3d_width=200,
grid3d_depth=80,
is_grid3d_rotate=True,  # auto-rotate
grid3d_rotate_speed=180,  # rotation speed
)
bar3d
Missing value handling
pandas fillna
sklearn Imputer
df_train['Age'].sample(10)
df_train['Age'].fillna(value=df_train['Age'].mean()).sample(10)
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
age = imp.fit_transform(df_train[['Age']].values).copy()
df_train.loc[:, 'Age'] = df_train['Age'].fillna(value=df_train['Age'].mean()).copy()
df_train.head()
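Note that `Imputer` was deprecated in scikit-learn 0.20 and removed in 0.22; on newer versions the equivalent is `SimpleImputer`. A sketch:
# Equivalent mean imputation on scikit-learn >= 0.20 (Imputer removed in 0.22)
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
age = imp.fit_transform(df_train[['Age']])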
Numeric features: scaling
# Log transform and similar
import numpy as np
log_age = df_train['Age'].apply(lambda x:np.log(x))
df_train.loc[:,'log_age'] = log_age
df_train.head(10)
# Range scaling: squash min and max to the [0, 1] interval
from sklearn.preprocessing import MinMaxScaler
mm_scaler = MinMaxScaler()
fare_trans = mm_scaler.fit_transform(df_train[['Fare']])
# Standardize each column to zero mean and unit variance
from sklearn.preprocessing import StandardScaler
std_scaler = StandardScaler()
fare_std_trans = std_scaler.fit_transform(df_train[['Fare']])
# Center by the median and scale by the quartiles (IQR); insensitive to outliers
from sklearn.preprocessing import robust_scale
fare_robust_trans = robust_scale(df_train[['Fare','Age']])
# Normalize each row to unit norm (row-wise normalization)
from sklearn.preprocessing import Normalizer
normalizer = Normalizer()
fare_normal_trans = normalizer.fit_transform(df_train[['Age','Fare']])
fare_normal_trans
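A quick sanity check: with the default L2 norm, every row of the normalized output should have unit length:
# Each row's L2 norm should be (approximately) 1 after Normalizer
np.linalg.norm(fare_normal_trans, axis=1)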
Statistics
# Max and min
max_age = df_train['Age'].max()
min_age = df_train["Age"].min()
# Quantiles for extreme-value handling; the crudest approach is simply to clip off the bottom and top 1% of values
age_quarter_01 = df_train['Age'].quantile(0.01)
print(age_quarter_01)
age_quarter_99 = df_train['Age'].quantile(0.99)
print(age_quarter_99)
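With the two quantiles in hand, the clipping itself is one line with pandas `clip`; a sketch:
# Clip extreme values to the 1st/99th percentiles (simple winsorization)
age_clipped = df_train['Age'].clip(lower=age_quarter_01, upper=age_quarter_99)
age_clipped.describe()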
Arithmetic combinations
df_train.loc[:,'family_size'] = df_train['SibSp']+df_train['Parch']+1
df_train.head()
df_train.loc[:,'tmp'] = df_train['Age']*df_train['Pclass'] + 4*df_train['family_size']
df_train.head()
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2)
df_train[['SibSp','Parch']].head()
poly_fea = poly.fit_transform(df_train[['SibSp','Parch']])
poly_fea
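For two input columns and degree=2, the expansion produces the terms 1, x0, x1, x0^2, x0*x1, x1^2; `get_feature_names` shows which output column is which (method name per older scikit-learn; newer versions use `get_feature_names_out`):
# Inspect the generated polynomial terms
poly.get_feature_names(['SibSp', 'Parch'])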
Discretization / binning / bucketing
# Equal-width binning
df_train.loc[:, 'fare_cut'] = pd.cut(df_train['Fare'], 20)
df_train.head()
df_train['fare_cut'].unique()
# Equal-frequency binning
df_train.loc[:,'fare_qcut'] = pd.qcut(df_train['Fare'], 10)
df_train.head()
df_train = df_train.sort_values('Fare')
alist = list(set(df_train['fare_qcut']))
badrate = {}
for x in alist:
    a = df_train[df_train.fare_qcut == x]
    bad = a[a.label == 1]['label'].count()
    good = a[a.label == 0]['label'].count()
    badrate[x] = bad / (bad + good)
f = zip(badrate.keys(),badrate.values())
f = sorted(f,key = lambda x : x[1],reverse = True )
badrate = pd.DataFrame(f)
badrate.columns = pd.Series(['cut','badrate'])
badrate = badrate.sort_values('cut')
print(badrate.head())
badrate.plot('cut','badrate')
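Since `label` is binary, the same per-bin bad rate can be computed more concisely with a groupby; a sketch:
# Equivalent bad-rate table: mean of the binary label within each bin
df_train.groupby('fare_qcut')['label'].mean()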
# One-hot encoding
# category dtypes
df_train.info()
embarked_oht = pd.get_dummies(df_train[['Embarked']])
embarked_oht.head()
fare_qcut_oht = pd.get_dummies(df_train[['fare_qcut']])
fare_qcut_oht.head()
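When the encoding needs to live inside an sklearn pipeline, `OneHotEncoder` gives the same effect as `get_dummies`; a sketch (string categories require scikit-learn >= 0.20, and NaN is filled first because older encoders reject it):
# sklearn equivalent of get_dummies, usable inside a Pipeline
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(handle_unknown='ignore')
embarked_ohe = ohe.fit_transform(df_train[['Embarked']].fillna('missing'))
ohe.categories_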
Datetime features: date handling
car_sales = pd.read_csv('car_data.csv')
car_sales.head()
car_sales.info()
car_sales.describe()
car_sales['date_t'].dtype
car_sales.loc[:,'date'] = pd.to_datetime(car_sales['date_t'])
car_sales.info()
car_sales.head()
# Extract key time information
# Extract the month
car_sales.loc[:,'month'] = car_sales['date'].dt.month
car_sales.head()
# Extract the day of the month
car_sales.loc[:,'dom'] = car_sales['date'].dt.day
# Extract the day of the year
car_sales.loc[:,'doy'] = car_sales['date'].dt.dayofyear
# Extract the day of the week
car_sales.loc[:,'dow'] = car_sales['date'].dt.dayofweek
car_sales.head()
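Boolean flags often follow directly from these fields; for example a weekend indicator (a sketch):
# Weekend flag derived from day of week (Monday=0 ... Sunday=6)
car_sales.loc[:, 'is_weekend'] = (car_sales['dow'] > 4).astype(int)
car_sales.head()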
Text features
from pyecharts import WordCloud
name = [
'梅老师', '金融', '风控', '实战', '人长得帅' ,
'机器学习', '深度学习', '异常检测', '知识图谱', '社交网络', '图算法',
'迁移学习', '不均衡学习', '反欺诈', '数据挖掘', '评分卡',
'集成算法', '模型融合','python', '学员聪明']
value = [
10000, 6181, 4386, 4055, 2467, 2244, 1898, 1484, 1112,
965, 847, 582, 555, 550, 462, 366, 360, 282, 273, 265]
wordcloud = WordCloud(width=800, height=600)
wordcloud.add("", name, value, word_size_range=[30, 80])
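As with `bar3d` above, evaluating the object renders the chart in a notebook under pyecharts 0.5.x:
wordcloud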
# Bag-of-words model
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
corpus = [
'This is a very good class',
'students are very very very good',
'This is the third sentence',
'Is this the last doc',
'PS teacher Mei is very very handsome'
]
X = vectorizer.fit_transform(corpus)
vectorizer.get_feature_names()
X.toarray()
vec = CountVectorizer(ngram_range=(1,3))
X_ngram = vec.fit_transform(corpus)
vec.get_feature_names()
X_ngram.toarray()
TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vec = TfidfVectorizer()
tfidf_X = tfidf_vec.fit_transform(corpus)
tfidf_vec.get_feature_names()
tfidf_X.toarray()
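For reference, with its default settings (smooth_idf=True, norm='l2'), TfidfVectorizer computes idf(t) = ln((1 + n) / (1 + df(t))) + 1, multiplies it by the term frequency tf(t, d), and then L2-normalizes each row, so the values differ slightly from the textbook tf * log(n / df) formula.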
# Combined features
# Use conditional logic to derive combined features
df_train.loc[:,'alone'] = (df_train['SibSp']==0)&(df_train['Parch']==0)
df_train.head()