# 1、数据的合并、标签的构建  (Part 1: merge the sheets and build the labels)
import pandas as pd
import numpy as np
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
# 1. Load the three sheets and inner-join them into one DataFrame.
# Raw string: the path contains backslashes, so r'...' prevents any
# accidental escape-sequence interpretation.  Defined once instead of
# being repeated in every read_excel call.
XLSX_PATH = r'D:\新大陆\笔试\【原始数据】20220926_2023标签数据集.xlsx'
KEY = '供应商会员服务序号(样本主键)'  # sample primary key shared by all sheets

df1 = pd.read_excel(XLSX_PATH, sheet_name='属性信息')      # attribute info
df2 = pd.read_excel(XLSX_PATH, sheet_name='商机信息')      # business-opportunity info
df3 = pd.read_excel(XLSX_PATH, sheet_name='其他行为信息')  # other behaviour info
k = pd.merge(df1, df2, how='inner', on=KEY)
df = pd.merge(k, df3, how='inner', on=KEY)

# 2. Swap the Chinese column names for the short aliases in Sheet1.
co = pd.read_excel(XLSX_PATH, sheet_name='Sheet1')
# The original evaluated this comparison and threw the result away;
# fail loudly if the name mapping is out of sync with the data.
if list(df.columns) != list(co['c']):
    raise ValueError('column-name mapping in Sheet1 does not match the merged data')
df.columns = list(co['y'])  # short column aliases
# 3. Build the binary renewal label.
df_c = df.copy()  # pristine copy so the raw merge can be recovered

print(df.isnull().sum() / len(df))  # per-column missing-value ratio
df.drop(['source'], axis=1, inplace=True)  # 'source' is mostly missing — drop it

print(df['label_n'].describe())
# label_n == 5 marks "renewal undecided" samples.  The original blanked those
# rows to NaN (rewriting every column and coercing dtypes) before dropna;
# plain boolean filtering achieves the same row set without the side effects.
# .copy() avoids chained-assignment warnings on the column insert below.
df = df[df['label_n'] != 5].dropna(axis=0).copy()

# Positive/negative label: 3 -> 0 (not renewed); 1, 2, 4 -> 1 (renewed).
df['label'] = df['label_n'].apply(lambda x: 0 if x == 3 else 1)
print(df['label'].describe())  # label distribution

n_sample = len(df)
label_counts = df['label'].value_counts()
n_1_sample = label_counts[1]
n_0_sample = label_counts[0]
# Roughly 7:3 positive/negative — mild class imbalance.
print('样本个数:{}; 1占{:.2%}; 0占{:.2%}'.format(
    n_sample, n_1_sample / n_sample, n_0_sample / n_sample))
print(1 - len(df) / len(df_c))  # share of samples filtered out
# 4. Feature engineering
# 4.1 Inspect dtypes/distributions, encode text, screen for outliers.
data = df.iloc[:, 1:].copy()  # drop the primary-key column; copy to avoid chained-assignment warnings
data.info()  # dtypes of each feature
print(data.describe([0.01, 0.1, 0.25, .5, .75, .9, .99]).T)  # value distributions

# Label-encode the 'city' text column.
city_codes = dict((city, code) for code, city in enumerate(np.unique(data['city'])))
data['city'] = data['city'].map(city_codes)

# 3-sigma rule outlier screen for every feature.
# The original loop body was unindented (SyntaxError) and overwrote
# `error`/`data_c` each iteration, keeping only the last feature's result.
# Report per-feature outlier counts instead; no output means no outliers.
for col in list(data.columns):
    mu = data[col].mean()
    sigma = data[col].std()
    n_outliers = int((np.abs(data[col] - mu) > 3 * sigma).sum())
    if n_outliers:
        print(col, n_outliers)

data.drop(['com_id'], axis=1, inplace=True)  # company id carries no signal

# Pearson correlation (coefficient, p-value) of each feature vs. the label.
for col in list(data.columns)[1:-1]:
    print(col, stats.pearsonr(data['label'], data[col]))

# Correlation heat map of all features.
c = data.iloc[:, 1:].corr()
plt.subplots(figsize=(17, 12))
sns.heatmap(c, annot=True, vmax=1, square=True, cmap="Reds")
plt.show()
# 指标名称更换如下所示 (the column-name mapping is shown below):
# 热力图如下所示 (the heat map is shown below):
# 2、Permutation Importance筛选特征 (Part 2: feature selection via Permutation Importance)
#4.2
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier as XGBC
from sklearn.linear_model import LogisticRegression as LR
import eli5
from eli5.sklearn import PermutationImportance
import imblearn
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score as ROC
from sklearn.metrics import recall_score as Recall
from sklearn.metrics import precision_score as Precision
from sklearn.metrics import roc_auc_score
# 4.2 Feature selection with Permutation Importance + model evaluation.
X = data.iloc[:, 1:-1]  # features
Y = data.iloc[:, -1]    # binary renewal label

# Oversample the minority (non-renewal) class with SMOTE.
# NOTE(review): resampling BEFORE the train/test split leaks synthetic
# neighbours of test points into training — consider splitting first.
sm = SMOTE(random_state=42)
x, y = sm.fit_resample(X, Y)
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3)

clf = XGBC().fit(xtrain, ytrain)
print(clf.score(xtest, ytest))  # baseline XGB accuracy on resampled data

Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.3)
print(clf.score(Xtest, Ytest))  # same model scored on the original distribution

# Rank features by permutation importance on the held-out set.
perm = PermutationImportance(clf, random_state=0).fit(xtest, ytest)
eli5.show_weights(perm, feature_names=xtest.columns.tolist())

# The 14 features that survived greedily trying the importance ranking;
# defined once instead of duplicating the list for train and test.
SELECTED = ['year', 'interval_is', 'interval_bf', 'day_d1', 'num_d2', 'day_i1',
            'mean_b', 'num_i1', 'interval_if', 'num_i3', 'mean_i', 'num_d3',
            'num_i2', 'day_i2']
xtrain1 = xtrain[SELECTED]
xtest1 = xtest[SELECTED]

clf = XGBC().fit(xtrain1, ytrain)
print(clf.score(xtest1, ytest))  # score with the reduced feature set

# Cross-check the selection with a second model family (logistic regression).
lr = LR().fit(xtrain, ytrain)
print(lr.score(xtest, ytest))
lr = LR().fit(xtrain1, ytrain)
print(lr.score(xtest1, ytest))

# Final evaluation at a 0.6 probability threshold.
y_proba = clf.predict_proba(xtest1)
prob = y_proba[:, 1]
y_pre = (prob >= 0.6).astype(int)
print('模型召回率:{}'.format(Recall(ytest, y_pre)))
print('模型准确率:{}'.format(Precision(ytest, y_pre)))
print('模型AUC:{}'.format(roc_auc_score(ytest, y_pre)))  # fixed: original line had an extra ')'