文章目录
预处理
基础概念
特征工程
数据越大,数据框架越简单
数据越小,数据框架越复杂
如有钱人喜欢用炸弹号,连续号
数据样本采集-抽样
异常值(空值)处理
特征预处理
特征选择
离职为1,其他为0
代码实现
注:不同于后期的数据建模,特征选择使用样本,进行小规模尝试
特征变换
放大差距的变换方法
对指化
把比较大的量级,通过对数的方法,得到直观的分级
如存款,使用log10
声音,使用分贝
地震,使用震级
离散化
归一化
归一化将数字映射到0-1,更直观,对比数据间距离
标准化
标准化,与样本分布相关
数值化
特征降维
求omega
特征衍生
代码实现
特征工程
import pandas as pd
df=pd.read_csv()
df=df.fillna(0)
upper_q=df["vol"].quantile(0.75)
lower_q=df["vol"].quantile(0.25)
q_int=upper_q-lower_q
k=1.5
df[df["vol"]>lower_q-k*q_int][df["vol"]<upper_q+k*q_int]
df[[True if item.startswith("d") else False for item in list(df["client_rfm"].values)]]
import numpy as np
import pandas as pd
import scipy.stats as ss
df=pd.DataFrame({"A":ss.norm.rvs(size=10),"B":ss.norm.rvs(size=10),"C":ss.norm.rvs(size=10),"D":np.random.randint(low=0,high=2,size=10)})
df
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
X=df.loc[:,["A","B","C"]]
Y=df.loc[:,["D"]]
from sklearn.feature_selection import SelectKBest,RFE,SelectFromModel
过滤思想
skb=SelectKBest(k=2)
skb.fit(X,Y)
skb.transform(X)
包裹思想
rfe=RFE(estimator=SVR(kernel="linear"),n_features_to_select=2,step=1)
rfe.fit_transform(X,Y)
嵌入思想
#threshold重要性低于多少,被去掉
sfm=SelectFromModel(estimator=DecisionTreeRegressor(),threshold=0.1)
sfm.fit_transform(X,Y)
#threshold重要性低于多少,被去掉
sfm=SelectFromModel(estimator=DecisionTreeRegressor(),threshold=0.01)
sfm.fit_transform(X,Y)
特征变化
离散化-分箱
import numpy as np
import pandas as pd
lst=[6,8,10,15,16,24,25,40,67]
#等深,按照个数分
pd.qcut(lst,q=3)
pd.qcut(lst,q=3,labels=["low","medium","high"])
#等宽,按照区间长度分
pd.cut(lst,bins=3)
归一化
from sklearn. preprocessing import MinMaxScaler,StandardScaler
MinMaxScaler().fit_transform(np.array([1,4,10,15,21]).reshape(-1,1))
标准化
StandardScaler().fit_transform(np.array([1,0,0,0,0,0,0,0]).reshape(-1,1))
StandardScaler().fit_transform(np.array([1,1,1,1,0,0,0,0]).reshape(-1,1))
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
LabelEncoder().fit_transform(np.array(["Down","Up","","Down"]).reshape(-1,1))
LabelEncoder().fit_transform(np.array(["Low","Medium","High","Medium"]).reshape(-1,1))
lb_encoder=LabelEncoder()
lb_tran_f=lb_encoder.fit_transform(np.array(["red","blue","yellow","yellow","green"]))
oht_encoder=OneHotEncoder().fit(lb_tran_f.reshape(-1,1))
oht_encoder.transform(lb_encoder.transform(np.array(["red","blue","yellow","yellow","green"])).reshape(-1,1))
oht_encoder.transform(lb_encoder.transform(np.array(["red","blue","yellow","yellow","green"])).reshape(-1,1)).toarray()
正规化
from sklearn.preprocessing import Normalizer
Normalizer(norm="l1").fit_transform(np.array([1,1,3,-1,2]).reshape(-1,1))
#正规化是对行进行的
Normalizer(norm="l1").fit_transform(np.array([[1,1,3,-1,2]]).reshape(1,-1))
Normalizer(norm="l2").fit_transform(np.array([[1,1,3,-1,2]]).reshape(1,-1))
LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
X=np.array([[-1,-1],[-2,-1],[-3,-2],[1,1],[2,1],[3,2]])
Y=np.array([1,1,1,2,2,2])
LinearDiscriminantAnalysis(n_components=1).fit_transform(X,Y)
fisher 分类器
clf=LinearDiscriminantAnalysis(n_components=1).fit(X,Y)
clf.predict([[0.8,1]])
以hr数据为例
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.preprocessing import Normalizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA
#sl:false:minimaxscaler,true:standerdscalar
#le:last_evaluation,
#cr:False:LabelEncoding,True:OneHotEncoding
def hr_preprocessing(dn=False,ta=False,pn=False,hn=False,vol=False,risk=False,cr=False,lower_d=False,ld_n=1):
df=pd.read_csv(r'C:\Users\wenxiaoyu_intern\Documents\用户画像\shengji.csv')
#1.清洗数据
df=df.fillna(0)
#2.得到标注
label=df["shengji_type"]
df=df.drop("shengji_type",axis=1)
#df=df[df["date_nums"<1000][df["vol"]>0]]
#3.特征选择
#4.特征处理
scaler_lst=[dn,ta,pn,hn,vol,risk]
column_list=["date_nums","total_amount","plan_nums","holding_plan_nums","vol","risk"]
for i in range(len(scaler_lst)):
if not scaler_lst[i]:
df[column_list[i]]=\
MinMaxScaler().fit_transform(df[column_list[i]].values.reshape(-1,1)).reshape(1,-1)[0]
else:
df[column_list[i]]=\
StandardScaler().fit_transform(df[column_list[i]].values.reshape(-1,1)).reshape(1,-1)[0]
scaler_list=[cr]
column_list=["client_rfm"]
for i in range(len(scaler_list)):
if not scaler_list[i]:
if column_list[i]=="client_rfm":
df[column_list[i]]=[map_client_rfm(s) for s in df["client_rfm"].values]
else:
df[column_list[i]]=\
LabelEncoder().fit_transform(df[column_list[i]].values.reshape(-1,1)).reshape(1,-1)[0]
df[column_list[i]]=\
MinMaxScaler().fit_transform(df[column_list[i]].values.reshape(-1,1)).reshape(1,-1)[0]
else:
df=pd.get_dummies(df,columns=[column_list[0]])
if lower_d:
#return LinearDiscriminantAnalysis(n_components=ld_n)
return PCA(n_components=ld_n).fit_transform(df.values),label
return df,label
def map_client_rfm(s):
return dict([("sleep",0),("lost",1),("decline",2),("normal",3),("active",4),("new",5),("vip",6)]).get(s,0)
hr_preprocessing(True,cr=True,lower_d=True,ld_n=4)
hr_preprocessing(True,cr=False)
df