# Case study: stock-brokerage customer churn early-warning model
# built with logistic regression.

# 1. Load the data.
import pandas as pd

df = pd.read_excel('股票客户流失.xlsx')
df.head()

# 2. Split into feature matrix X and target vector y.
X = df.drop(columns='是否流失')
y = df['是否流失']

# 3. Train/test split (80/20, fixed seed for reproducibility).
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train.head()
y_train.head()
X_test.head()
y_test.head()

# 4. Build and fit the model.
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)

# 5. Model use 1 - predict class labels.
y_pred = model.predict(X_test)
print(y_pred[0:100])

a = pd.DataFrame()
a['预测值'] = list(y_pred)
a['实际值'] = list(y_test)
a.head()

from sklearn.metrics import accuracy_score
# FIX(review): accuracy_score's signature is (y_true, y_pred); the result is
# the same for accuracy, but the argument order is corrected for clarity.
score = accuracy_score(y_test, y_pred)
print(score)
model.score(X_test, y_test)

# 6. Model use 2 - predict class probabilities.
y_pred_proba = model.predict_proba(X_test)
print(y_pred_proba[0:5])
a = pd.DataFrame(y_pred_proba, columns=['不流失概率', '流失概率'])
a.head()
print(y_pred_proba[:, 1])

# 7. Inspect the fitted coefficients (extra material, for reference).
print(model.coef_)
print(model.intercept_)
import numpy as np
for i in range(5):
    # Manually reproduce predict_proba: sigmoid of the linear score w.x + b.
    print(1 / (1 + np.exp(-(np.dot(X_test.iloc[i], model.coef_.T) + model.intercept_))))

# NOTE(review): the original cell repeated the entire pipeline a second time
# verbatim (re-import, re-read, re-fit, re-print); the duplicate block was
# removed because it added no new behavior.
# Quick random-forest demos on a tiny hand-made dataset.
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# Classification: three samples of class 0, two of class 1.
X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
y = [0, 0, 0, 1, 1]
model = RandomForestClassifier(n_estimators=10, random_state=123)
model.fit(X, y)
print(model.predict([[5, 5]]))

# Regression: targets grow linearly with the inputs.
X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
y = [1, 2, 3, 4, 5]
model = RandomForestRegressor(n_estimators=10, random_state=123)
model.fit(X, y)
print(model.predict([[5, 5]]))
# Minimal XGBoost demos.
from xgboost import XGBClassifier, XGBRegressor
import numpy as np

# 1. Classification on a tiny dataset.
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])
y = [0, 0, 0, 1, 1]
model = XGBClassifier()
model.fit(X, y)
print(model.predict([[5, 5]]))

# 2. Regression on a tiny dataset.
X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
y = [1, 2, 3, 4, 5]
model = XGBRegressor()
model.fit(X, y)
print(model.predict([[5, 5]]))
# 10.2 XGBoost case study 1 - financial anti-fraud model.

# ---- 10.2.2 Model building ----
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score
from xgboost import XGBClassifier

# 1. Load the data.
df = pd.read_excel('信用卡交易数据.xlsx')
df.head()

# 2. Features and target.
X = df.drop(columns='欺诈标签')
y = df['欺诈标签']

# 3. Train/test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# 4. Fit the classifier.
clf = XGBClassifier(n_estimators=100, learning_rate=0.05)
clf.fit(X_train, y_train)

# ---- 10.2.3 Prediction and evaluation ----
y_pred = clf.predict(X_test)
print(y_pred)

# Side-by-side table of predicted vs. actual labels.
a = pd.DataFrame()
a['预测值'] = list(y_pred)
a['实际值'] = list(y_test)
a.head()

score = accuracy_score(y_pred, y_test)
print(score)
print(clf.score(X_test, y_test))

y_pred_proba = clf.predict_proba(X_test)
print(y_pred_proba[0:5])  # first five probability pairs

# ROC curve and AUC from the positive-class probabilities.
fpr, tpr, thres = roc_curve(y_test, y_pred_proba[:, 1])
plt.plot(fpr, tpr)
plt.show()
score = roc_auc_score(y_test, y_pred_proba[:, 1])
print(score)

# Feature importances, tabulated and sorted.
print(clf.feature_importances_)
features = X.columns
importances = clf.feature_importances_
importances_df = pd.DataFrame()
importances_df['特征名称'] = features
importances_df['特征重要性'] = importances
importances_df.sort_values('特征重要性', ascending=False)

# ---- 10.2.4 Hyper-parameter tuning (grid search, AUC, 5-fold CV) ----
parameters = {'max_depth': [1, 3, 5],
              'n_estimators': [50, 100, 150],
              'learning_rate': [0.01, 0.05, 0.1, 0.2]}
clf = XGBClassifier()
grid_search = GridSearchCV(clf, parameters, scoring='roc_auc', cv=5)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)

# Refit with the best parameters found above and re-evaluate AUC.
clf = XGBClassifier(max_depth=1, n_estimators=100, learning_rate=0.05)
clf.fit(X_train, y_train)
y_pred_proba = clf.predict_proba(X_test)
score = roc_auc_score(y_test, y_pred_proba[:, 1])
print(score)
# Minimal LightGBM demos.
from lightgbm import LGBMClassifier, LGBMRegressor

# 1. Classification on a tiny dataset.
X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
y = [0, 0, 0, 1, 1]
model = LGBMClassifier()
model.fit(X, y)
print(model.predict([[5, 5]]))

# 2. Regression on a tiny dataset.
X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
y = [1, 2, 3, 4, 5]
model = LGBMRegressor()
model.fit(X, y)
print(model.predict([[5, 5]]))
# 10.5 LightGBM case study 1 - customer default-prediction model.

# ---- 10.5.2 Model building ----
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score
from lightgbm import LGBMClassifier

# 1. Load the data.
df = pd.read_excel('客户信息及违约表现.xlsx')
df.head()

# 2. Features and target.
# FIX(review): renamed `Y` to `y` for consistency with every other section
# in this file (the name was purely local to this script).
X = df.drop(columns='是否违约')
y = df['是否违约']

# 3. Train/test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# 4. Fit the classifier.
model = LGBMClassifier()
model.fit(X_train, y_train)

# Predictions and evaluation.
y_pred = model.predict(X_test)
print(y_pred)

a = pd.DataFrame()
a['预测值'] = list(y_pred)
a['实际值'] = list(y_test)
a.head()

score = accuracy_score(y_pred, y_test)
print(score)
print(model.score(X_test, y_test))

# ROC curve and AUC from the positive-class probabilities.
y_pred_proba = model.predict_proba(X_test)
fpr, tpr, thres = roc_curve(y_test, y_pred_proba[:, 1])
plt.plot(fpr, tpr)
plt.show()
score = roc_auc_score(y_test, y_pred_proba[:, 1])
print(score)

# Feature importances, tabulated and sorted.
print(model.feature_importances_)
features = X.columns
importances = model.feature_importances_
importances_df = pd.DataFrame()
importances_df['特征名称'] = features
importances_df['特征重要性'] = importances
importances_df.sort_values('特征重要性', ascending=False)

# ---- 10.5.4 Hyper-parameter tuning (grid search, AUC, 5-fold CV) ----
parameters = {'num_leaves': [10, 15, 31],
              'n_estimators': [10, 20, 30],
              'learning_rate': [0.05, 0.1, 0.2]}
model = LGBMClassifier()
grid_search = GridSearchCV(model, parameters, scoring='roc_auc', cv=5)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)

# Refit with the tuned parameters and evaluate once.
model = LGBMClassifier(num_leaves=15, n_estimators=20, learning_rate=0.1)
model.fit(X_train, y_train)
# FIX(review): the original computed the identical predict_proba(X_test)
# twice in a row; it is computed once here and reused for curve and score.
y_pred_proba = model.predict_proba(X_test)
fpr, tpr, thres = roc_curve(y_test, y_pred_proba[:, 1])
plt.plot(fpr, tpr)
plt.show()
score = roc_auc_score(y_test, y_pred_proba[:, 1])
print(score)
####决策树
# Decision-tree classification of the sklearn wine dataset:
# first a C4.5-style tree (entropy), then CART (gini), rendered via graphviz.
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.datasets import load_wine
import pandas as pd
import graphviz
# FIX(review): removed `import treeplotter` and `from math import log2` -
# neither name is used anywhere in this script, and `treeplotter` is not an
# installable package, so the import raised ImportError and aborted the run.

# Load the dataset.
wine = load_wine()
X = wine.data
Y = wine.target
features_name = wine.feature_names
print(wine)
print(features_name)
pd.concat([pd.DataFrame(X), pd.DataFrame(Y)], axis=1)  # inspect the data

# 80/20 train/test split (no fixed seed, so accuracy varies run to run).
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
# print(x_train.shape)  # (142, 13)
# print(x_test.shape)   # (36, 13)

# C4.5-style tree: information-gain (entropy) criterion.
model = tree.DecisionTreeClassifier(criterion="entropy", splitter="best",
                                    max_depth=None, min_samples_split=2,
                                    min_samples_leaf=1, min_weight_fraction_leaf=0.0,
                                    max_features=None, random_state=None,
                                    max_leaf_nodes=None, class_weight=None)
model.fit(x_train, y_train)
score = model.score(x_test, y_test)
y_predict = model.predict(x_test)
print('准确率为:', score)
print(y_predict)
print(y_test)
# 准确率为: 0.9444444444444444
# Compare predicted vs. true labels side by side.
pd.concat([pd.DataFrame(x_test), pd.DataFrame(y_test), pd.DataFrame(y_predict)], axis=1)

# CART tree: gini criterion.
model = tree.DecisionTreeClassifier(criterion="gini", splitter="best",
                                    max_depth=None, min_samples_split=2,
                                    min_samples_leaf=1, min_weight_fraction_leaf=0.0,
                                    max_features=None, random_state=None,
                                    max_leaf_nodes=None, class_weight=None)
model.fit(x_train, y_train)
score = model.score(x_test, y_test)
y_predict = model.predict(x_test)
print('准确率为:', score)
# 准确率为: 1.0

# Render the fitted tree with Chinese feature/class names.
feature_name = ['酒精', '苹果酸', '灰', '灰的碱性', '镁', '总酚', '类黄酮',
                '非黄烷类酚类', '花青素', '颜色强度', '色调',
                'od280/od315稀释葡萄酒', '脯氨酸']
dot_data = tree.export_graphviz(model, out_file=None,
                                feature_names=feature_name,
                                class_names=['二锅头', '苦荞', '江小白'],
                                filled=True, rounded=True)
graph = graphviz.Source(dot_data)
graph
# graph.render('tree')
##########################
# Loan-default data: exploratory analysis of train.csv
# (missing values, constant columns, feature typing, date features).
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from tqdm import tqdm
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
import lightgbm as lgb
from catboost import CatBoostRegressor
import warnings
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# FIX(review): removed `import le as le` - an IDE auto-import artifact; no
# module named `le` exists (the later `le = LabelEncoder()` binds the name),
# so the import raised ImportError and aborted the script.

pd.set_option('display.width', None)

# Locate train.csv in the working directory; try GBK first, then UTF-8.
current_dir = os.getcwd()
file_dir = current_dir
file_name = file_dir + '\\train.csv'
try:
    data = pd.read_csv(file_name, encoding='gbk')
except:
    data = pd.read_csv(file_name, encoding='utf-8')

# Features whose null ratio exceeds 50%.
have_null_fea_dict = (data.isnull().sum() / len(data)).to_dict()
fea_null_moreThanHalf = {}
for key, value in have_null_fea_dict.items():
    if value > 0.5:
        fea_null_moreThanHalf[key] = value
fea_null_moreThanHalf

# Bar chart of per-feature missing ratios (features with any nulls only).
missing = data.isnull().sum() / len(data)
missing = missing[missing > 0]
missing.sort_values(inplace=True)
missing.plot.bar()

# Columns taking a single value carry no information.
one_value_fea = [x for x in data.columns if data[x].nunique() <= 1]
print(one_value_fea)

# Small standalone ROC demo on hard-coded labels.
from sklearn.metrics import roc_curve
y_pred = [0, 1, 1, 0, 1, 1, 0, 1, 1, 1]
y_true = [0, 1, 1, 0, 1, 0, 1, 1, 0, 1]
FPR, TPR, thresholds = roc_curve(y_true, y_pred)
plt.title('ROC')
plt.plot(FPR, TPR, 'b')
plt.plot([0, 1], [0, 1], 'r--')
plt.ylabel('TPR')
plt.xlabel('FPR')

# Separate numeric-typed from object-typed (categorical) columns.
numerical_fea = list(data.select_dtypes(exclude=['object']).columns)
category_fea = list(filter(lambda x: x not in numerical_fea, list(data.columns)))


def get_numerical_serial_fea(data, feas):
    """Split numeric columns of `data` into continuous (>10 distinct values)
    and discrete (<=10 distinct values) lists.

    Returns (continuous_features, discrete_features).
    """
    numerical_serial_fea = []
    numerical_noserial_fea = []
    for fea in feas:
        temp = data[fea].nunique()
        if temp <= 10:
            numerical_noserial_fea.append(fea)
            continue
        numerical_serial_fea.append(fea)
    return numerical_serial_fea, numerical_noserial_fea


# FIX(review): the original assigned the returned 2-tuple to single names
# (`numerical_noserial_fea = get_numerical_serial_fea(...)`), printing a
# mislabeled tuple, and then re-defined the identical function a second
# time.  Unpack once and print each list instead.
numerical_serial_fea, numerical_noserial_fea = get_numerical_serial_fea(data, numerical_fea)
print(numerical_noserial_fea)
print('分界线----------------------------------------')
print(numerical_serial_fea)

'''
#每个数字特征得分布可视化
f = pd.melt(data, value_vars=numerical_serial_fea)
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False)
g = g.map(sns.distplot, "value")
plt.figure(figsize=(9,6))
plt.suptitle('Transaction Values Distribution', fontsize=22)
plt.subplot(221)
sub_plot_1 = sns.distplot(data['loanAmnt'])
sub_plot_1.set_title("loanAmnt Distribuition", fontsize=18)
sub_plot_1.set_xlabel("")
sub_plot_1.set_ylabel("Probability", fontsize=15)
plt.subplot(222)
sub_plot_2 = sns.distplot(np.log(data['loanAmnt']))
sub_plot_2.set_title("loanAmnt (Log) Distribuition", fontsize=18)
sub_plot_2.set_xlabel("")
sub_plot_2.set_ylabel("Probability", fontsize=15)
plt.show()
'''

# Distribution of employmentLength (top 20 values, NaN included).
plt.figure(figsize=(8, 8))
sns.barplot(data["employmentLength"].value_counts(dropna=False)[:20],
            data["employmentLength"].value_counts(dropna=False).keys()[:20])
print(data.isnull().sum())
print(data['issueDate'].min())

# Convert issueDate to datetime and derive the number of days elapsed since
# 2007-06-01 (the earliest month in the data).
data['issueDate'] = pd.to_datetime(data['issueDate'], format='%Y-%m-%d')
startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
data['issueDateDT'] = data['issueDate'].apply(lambda x: x - startdate).dt.days
print(data['issueDateDT'].min())
# Data cleaning continues in the next section.
# Data cleaning and feature engineering (continues the train.csv pipeline).

def employmentLength_to_int(s):
    """Convert an employment-length string like '5 years' to np.int8(5).

    NaN values pass through unchanged.
    """
    if pd.isnull(s):
        return s
    else:
        return np.int8(s.split()[0])


# Normalise the two irregular categories, then strip the ' years' suffix.
data['employmentLength'].replace(to_replace='10+ years', value='10 years', inplace=True)
data['employmentLength'].replace('< 1 year', '0 years', inplace=True)
data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)
print(data['employmentLength'].value_counts(dropna=False).sort_index())

# Cardinality of each continuous numeric feature.
for f in numerical_serial_fea:
    print(f, '类型数:', data[f].nunique())
print(data.isnull().sum())

# FIX(review): the original wrote `.fillna(data[numerical_fea]).median()` -
# the misplaced parenthesis replaced the numeric columns with their medians
# instead of imputing NaNs.  Fill numeric NaNs with each column's median.
data[numerical_fea] = data[numerical_fea].fillna(data[numerical_fea].median())
# FIX(review): `.mode()` returns a DataFrame, so filling with it only
# affected row 0.  Use the first mode row so every NaN gets the column mode.
data[category_fea] = data[category_fea].fillna(data[category_fea].mode().iloc[0])

# earliesCreditLine: keep only the 4-digit year at the end of the string.
data['earliesCreditLine'].sample(5)
data['earliesCreditLine'] = data['earliesCreditLine'].apply(lambda s: int(s[-4:]))
print(data['earliesCreditLine'])

# Ordinal-encode the letter grades A..G as 1..7.
data['grade'] = data['grade'].map({'A': 1, 'B': 2, 'C': 3, "D": 4, 'E': 5, 'F': 6, 'G': 7})


def find_outliers_by_3segama(data1, fea):
    """Flag values of `fea` outside mean +/- 3*std in a new
    `<fea>_outliers` column ('异常值' = outlier, '正常值' = normal).

    Mutates and returns `data1`.
    """
    data_std = np.std(data1[fea])
    data_mean = np.mean(data1[fea])
    outliers_cut_off = data_std * 3
    lower_rule = data_mean - outliers_cut_off
    upper_rule = data_mean + outliers_cut_off
    data1[fea + '_outliers'] = data1[fea].apply(
        lambda x: str('异常值') if x > upper_rule or x < lower_rule else '正常值')
    # FIX(review): return the frame that was passed in; the original returned
    # the global `data`, which only worked by accident because callers always
    # passed that same object.
    return data1


# Flag outliers per numeric feature and inspect their default counts.
for fea in numerical_fea:
    data = find_outliers_by_3segama(data, fea)
    print(data[fea + '_outliers'].value_counts())
    print(data.groupby(fea + '_outliers')['isDefault'].sum())
    print('*' * 10)

# Drop rows flagged as outliers on any numeric feature.
for fea in numerical_fea:
    data = data[data[fea + '_outliers'] == '正常值']
    data = data.reset_index(drop=True)

# Target-mean encoding for grade and subGrade.
for col in ['grade', 'subGrade']:
    temp_dict = data.groupby([col])['isDefault'].agg(['mean']).reset_index().rename(
        columns={'mean': col + '_target_mean'})
    temp_dict.index = temp_dict[col].values
    temp_dict = temp_dict[col + '_target_mean'].to_dict()
    data[col + '_target_mean'] = data[col].map(temp_dict)

# Derived ratio features: grade relative to the group mean/std of each n*-column.
for df in [data]:
    for item in ['n0', 'n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9',
                 'n10', 'n11', 'n12', 'n13', 'n14']:
        df['grade_to_mean_' + item] = df['grade'] / df.groupby([item])['grade'].transform('mean')
        df['grade_to_std_' + item] = df['grade'] / df.groupby([item])['grade'].transform('std')

# Label-encode high-cardinality categoricals for direct use in tree models.
for col in tqdm(['employmentTitle', 'postCode', 'title', 'subGrade']):
    le = LabelEncoder()
    le.fit(list(data[col].astype(str).values))
    data[col] = le.transform(list(data[col].astype(str).values))
print('Label Encoding 完成')
#########################
# Decision tree on the heart-disease dataset: fit a small tree, then report
# the AUC on both the training and the test split.
# NOTE(review): at this point `file_name` still holds the path set in the
# earlier train.csv section - confirm this is the intended dataset.
try:
    data = pd.read_csv(file_name, encoding='utf-8')
except:
    data = pd.read_csv(file_name, encoding='gbk')
print(data)
print(data.isnull().sum())

from sklearn import tree
from sklearn.model_selection import train_test_split

# Numeric predictors only; HeartDisease is the binary target.
x_columns = ['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak']
x = data[x_columns]
y = data['HeartDisease']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# A deliberately small tree (depth 7, at most 6 leaves) to limit overfitting.
model = tree.DecisionTreeClassifier(criterion='gini', max_depth=7, max_leaf_nodes=6)
model.fit(x_train, y_train)

import sklearn.metrics as metrics

# AUC on the training split, from the positive-class probabilities.
train_proba = model.predict_proba(x_train)
fpr, tpr, threshold = metrics.roc_curve(y_train, train_proba[:, 1])
auc_value = metrics.auc(fpr, tpr)
print('训练集auc', auc_value)

# AUC on the held-out split.
test_proba = model.predict_proba(x_test)
fpr, tpr, threshold = metrics.roc_curve(y_test, test_proba[:, 1])
auc_value = metrics.auc(fpr, tpr)
print('测试集auc', auc_value)
#############
import os
from scipy.stats import chi2
import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
pd.set_option('display.width',None)


def compute_woe(x,y):
    """Compute the WOE transform of factor `x` against binary target `y`.

    `y` is assumed to be coded 0 = good, 1 = bad.
    Returns a tuple (woe_series, iv_value): `x` mapped category-by-category
    to its WOE value, plus the factor's total Information Value.
    """
    # Build a working table with a constant count column for pivoting.
    s_data=pd.DataFrame({'x':x,'y':y})
    s_data['count']=[1]*s_data.shape[0]
    # Pivot into a (category x class) contingency table.
    pivot_data=s_data.pivot_table(index='x', columns='y', values='count', aggfunc='count')
    # Categories missing one class get count 0 instead of NaN.
    pivot_data.fillna(value=0,inplace=True)
    # Rename the class columns: 0 -> good, 1 -> bad.
    pivot_data.rename(columns={0:'good',1:'bad'},inplace=True)
    # Share of all goods / all bads captured by each category.
    pivot_data['good_rate']=pivot_data['good']/pivot_data['good'].sum()
    pivot_data['bad_rate']=pivot_data['bad']/pivot_data['bad'].sum()
    # +0.01 smoothing guards against division by zero and log(0).
    pivot_data['rate']=(pivot_data['bad_rate']+0.01)/(pivot_data['good_rate']+0.01)
    # WOE = log of the (smoothed) bad/good ratio.
    pivot_data['woe']=pivot_data['rate'].map(lambda x:math.log(x))
    # Per-category IV contribution; the factor's IV is their sum.
    pivot_data['iv']=pivot_data['woe']*(pivot_data['bad_rate']-pivot_data['good_rate'])
    iv_value=pivot_data['iv'].sum()
    # Map each original category to its WOE value.
    woe_dict=dict(zip(list(pivot_data.index),list(pivot_data['woe'])))
    s_data['x_woe']=s_data['x'].map(woe_dict)
    return s_data['x_woe'],iv_value


def chi_bin(x,y,max_bins,confidence):
    """Chi-square (ChiMerge-style) optimal binning of continuous factor `x`.

    Starts from up-to-20 equal-frequency bins and repeatedly merges the
    adjacent pair with the smallest chi-square statistic until at most
    `max_bins` bins remain AND every adjacent pair differs significantly at
    the given `confidence` level (chi-square with 1 degree of freedom).

    Returns the resulting bin edges as a list starting with -inf and ending
    with +inf, suitable for pd.cut.
    """
    data=pd.DataFrame({'x':x,'y':y})
    data['count']=[1]*data.shape[0]
    # Initial equal-frequency pre-binning (duplicate edges dropped).
    cut_bins=pd.qcut(data['x'],20,retbins=True,duplicates='drop')[1]
    # Label every row with the upper edge of its bin.
    data['x_cut']=pd.cut(data['x'],bins=cut_bins,labels=cut_bins[1:])
    data['x_cut']=data['x_cut'].astype(np.float64)
    # (bin x class) contingency table with per-bin totals.
    pivot_result=data.pivot_table(index='x_cut', columns='y', values='count', aggfunc='count')
    pivot_result.rename(columns={0:'good',1:'bad'},inplace=True)
    pivot_result['num']=pivot_result['good']+pivot_result['bad']
    # Merge until only 2 bins remain or the stopping rule below fires.
    while pivot_result.shape[0]>2:
        index_list=list(pivot_result.index)
        for i in range(len(index_list)-1):
            # Chi-square statistic of each adjacent bin pair, using the
            # pooled bad rate as the expected frequency.
            f1=pivot_result.loc[index_list[i],'bad']
            f2=pivot_result.loc[index_list[i+1],'bad']
            f1_num=pivot_result.loc[index_list[i],'num']
            f2_num=pivot_result.loc[index_list[i+1],'num']
            ratio=(f1+f2)/(f1_num+f2_num)
            f1_e=ratio*f1_num
            f2_e=ratio*f2_num
            chi_value=(f1-f1_e)**2/f1_e+(f2-f2_e)**2/f2_e
            # Store the pair's statistic on the first bin of the pair.
            pivot_result.loc[index_list[i],'chi_value']=chi_value
        # Stop once few enough bins remain AND every adjacent pair is
        # significantly different at the requested confidence level.
        chi_c = chi2.ppf(confidence, 1)
        if pivot_result.shape[0]<=max_bins \
                and pivot_result['chi_value'].min()>chi_c:
            break
        # Merge the least-different pair into the later of the two bins.
        min_id=pivot_result['chi_value'].idxmin()
        min_id_next=index_list[index_list.index(min_id)+1]
        pivot_result.loc[min_id_next,'good']=pivot_result.loc[min_id,'good']+\
            pivot_result.loc[min_id_next,'good']
        pivot_result.loc[min_id_next,'bad']=pivot_result.loc[min_id,'bad']+\
            pivot_result.loc[min_id_next,'bad']
        pivot_result.loc[min_id_next,'num']=pivot_result.loc[min_id,'num']+\
            pivot_result.loc[min_id_next,'num']
        pivot_result.drop(index=[min_id],inplace=True)
    # Turn the surviving bin labels into open-ended cut points.
    pivot_result['bad_rate']=pivot_result['bad']/pivot_result['num']
    chi_index_list=list(pivot_result.index)
    chi_index_list.insert(0,-np.inf)
    chi_index_list[-1]=np.inf
    return chi_index_list


# ---- Scorecard-style pipeline on heart.csv ----
current_dir=os.getcwd()
file_dir=current_dir+'\\建模数据'
file_name=file_dir+'\\heart.csv'
try:
    data=pd.read_csv(file_name,encoding='utf-8')
except:
    data=pd.read_csv(file_name,encoding='gbk')
print(data)
s_data=data.copy()
# Categorical factors vs. continuous factors.
quality_list=['Sex','ChestPainType','FastingBS','RestingECG','ExerciseAngina','ST_Slope']
quantity_list=['Age','RestingBP','Cholesterol','MaxHR','Oldpeak']
# Split the rows 50/50 (by position) to compare population stability (PSI).
print(data['Sex'])
split_point=list(data.index)[math.floor(0.5*data.shape[0])]
data.loc[:split_point,'data_set']=1
data.loc[split_point:,'data_set']=2
data1=data.loc[data['data_set']==1,:].copy()
data2=data.loc[data['data_set']==2,:].copy()
'''
data['HeartDisease']=data['HeartDisease'].replace(0,'good')
data['HeartDisease']=data['HeartDisease'].replace(1,'bad')
data1_pivot=data1.pivot_table(index='Sex', columns='HeartDisease', values='cust_no', aggfunc='count')
data1_pivot['num']=data1_pivot['good']+data1_pivot['bad']
print(data1_pivot)
'''
# PSI of the Sex factor between the two halves:
# psi = sum((rate1 - rate2) * ln(rate1 / rate2)).
concat_data=pd.concat([data1.groupby('Sex')['HeartDisease'].mean(),
                       data2.groupby('Sex')['HeartDisease'].mean()],axis='columns')
concat_data.columns=['rate1','rate2']
concat_data['rate3']=concat_data['rate1']/concat_data['rate2']
concat_data['ln_rate3']=concat_data['rate3'].map(lambda x:math.log(x))
concat_data['psi']=(concat_data['rate1']-concat_data['rate2'])*concat_data['ln_rate3']
psi_value=concat_data['psi'].sum()
print(concat_data)
print(psi_value)
# WOE-encode the categorical factors and record each factor's IV.
iv_record={'factor_name':[],'iv':[],'type':[]}
for i in quality_list:
    woe_iv=compute_woe(data[i],data['HeartDisease'])
    # Replace the raw categories with their WOE values.
    data[i]=woe_iv[0]
    iv_record['factor_name'].append(i)
    iv_record['iv'].append(woe_iv[1])
    iv_record['type'].append('quality')
# Chi-square binning for continuous factors, then WOE/IV on the bins.
for i in quantity_list:
    chi_bins=chi_bin(data[i],data['HeartDisease'],5,0.95)
    print(chi_bins)
    # Cut using the optimal bins; each value is labelled by its upper edge.
    data[i]=pd.cut(data[i],bins=chi_bins,labels=chi_bins[1:])
    woe_iv=compute_woe(data[i],data['HeartDisease'])
    # NOTE(review): unlike the categorical loop above, the WOE series is not
    # mapped back onto data[i] here - confirm whether that is intended.
    iv_record['factor_name'].append(i)
    iv_record['iv'].append(woe_iv[1])
    iv_record['type'].append('quantity')
iv_record_data=pd.DataFrame(iv_record)
iv_record_data.sort_values(by='iv',inplace=True,ascending=False)
print(iv_record_data)
# Pairwise correlation filter among the continuous factors: when |r| > 0.3,
# mark the later-listed (lower-IV-priority) factor for removal.
factor_list=list(iv_record_data.loc[iv_record_data['type']=='quantity',
                                    'factor_name'])
s_data[factor_list]=s_data[factor_list].astype(np.float64)
corr_data=s_data.corr()
del_factor=[]
print(factor_list)
for i in range(len(factor_list)):
    for j in range(i+1,len(factor_list)):
        corr_value=corr_data.loc[factor_list[i],factor_list[j]]
        if abs(corr_value)>0.3:
            del_factor.append(factor_list[j])
print(del_factor)