数据来源
是在kaggle上下载的,或者来自某个社区的数据集,点赞后私信我拿数据集哦~
全流程
1,导入包
import warnings
from pandas_profiling import ProfileReport
from lightgbm import LGBMClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_curve
from sklearn.metrics import f1_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.preprocessing import label_binarize
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from collections import Counter
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Global plotting/warning setup for the notebook.
# Seaborn theme: dark grid background for every subsequent chart.
sns.set_style("darkgrid")
# Silence all warnings (sklearn convergence / deprecation noise would
# otherwise flood the grid-search output below).
warnings.filterwarnings("ignore")
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
2,导入数据并查看前五个数据
# Load the Kaggle train/test splits from local CSV files.
# NOTE(review): hard-coded absolute Windows paths — adjust per machine.
train_path = r'C:\Users\hp\Desktop\python\kaggle\train_set.csv'
test_path = r'C:\Users\hp\Desktop\python\kaggle\test_set.csv'
df = pd.read_csv(train_path)
tf = pd.read_csv(test_path)
# Peek at the first five rows of the training set.
df.head()
3,据常识判断,利用df.drop()函数将无用变量删掉
# Drop columns judged uninformative by domain knowledge, from both splits.
cols_to_drop = ['ID', 'marital', 'month']
df = df.drop(columns=cols_to_drop)
tf = tf.drop(columns=cols_to_drop)
df.head()
4,查看数据整体信息
df.info()
可以看到,整个训练集有25317个数据,并且无缺失值,因此省去了处理缺失值的步骤
5,为方便处理,将类别较多的文字型特征直接删掉。正常情况下,应采用独热编码或者二进制编码加以处理,而非直接丢弃
# Drop the high-cardinality text features for simplicity (one-hot or
# binary encoding would be the proper treatment, per the note above).
text_cols = ['job', 'education']
df = df.drop(columns=text_cols)
tf = tf.drop(columns=text_cols)
6,查看剩余信息中的类别特征(图略)
# Stacked bar charts: distribution of the target `y` within each level of
# the categorical features default / contact / loan.
Default_no = df.loc[df['default'] == 'no', 'y'].value_counts()
Default_yes = df.loc[df['default'] == 'yes', 'y'].value_counts()
dff = pd.DataFrame({'de': Default_yes, 'notde': Default_no})
u = df.loc[df['contact'] == 'unknown', 'y'].value_counts()
c = df.loc[df['contact'] == 'cellular', 'y'].value_counts()
t = df.loc[df['contact'] == 'telephone', 'y'].value_counts()
dfff = pd.DataFrame({'u': u, 't': t, 'c': c})
ln = df.loc[df['loan'] == 'no', 'y'].value_counts()
ly = df.loc[df['loan'] == 'yes', 'y'].value_counts()
dffff = pd.DataFrame({'yes': ly, 'no': ln})
for frame in (dff, dfff, dffff):
    frame.plot(kind='bar', stacked=True)
plt.show()
7,将default,contact,loan特征进行编码
def binaryFeature(data):
    """Binary-encode the yes/no columns 'default', 'housing' and 'loan'.

    For each source column a new 0/1 column with a trailing underscore is
    added ('yes' -> 1, anything else -> 0); the original text columns are
    then dropped from the returned frame.

    Note: the new columns are added to *data* in place, matching the
    original implementation's side effect on the caller's frame.
    """
    for col in ('default', 'housing', 'loan'):
        # Vectorised boolean comparison instead of the original chained
        # assignment data[col + '_'][mask] = 1, which triggers
        # SettingWithCopyWarning and is unreliable under copy-on-write.
        data[col + '_'] = (data[col] == 'yes').astype(int)
    return data.drop(['default', 'housing', 'loan'], axis=1)
# Apply the yes/no binary encoding to both the train and test frames,
# then confirm the resulting dtypes/columns.
df= binaryFeature(df)
tf= binaryFeature(tf)
df.info()
8,查看poutcome这个特征与y的关系,会发现几乎不影响y,故删去
# Target distribution within each level of `poutcome`; the chart suggests
# it is nearly uninformative, so the column is dropped afterwards.
u = df.y[df.poutcome == 'unknown'].value_counts()
f = df.y[df.poutcome == 'failure'].value_counts()
o = df.y[df.poutcome == 'other'].value_counts()
s = df.y[df.poutcome == 'success'].value_counts()
# BUG FIX: the original built the frame with {'c': c}, reusing the *contact*
# 'cellular' counts from an earlier cell instead of the freshly computed
# 'other' counts `o` (which it computed and never used).
dfffff = pd.DataFrame({'u': u, 'f': f, 'o': o, 's': s})
dfffff.plot(kind='bar', stacked=True)
plt.show()
df = df.drop(['poutcome'], axis=1)
tf = tf.drop(['poutcome'], axis=1)
9,将y特征处理到最后一列,便于后面的预测分类
# Move the target column `y` to the last position (index 10, after the
# ten remaining features) to simplify the feature/target split below.
y_col = df['y']
df = df.drop(columns='y')
df.insert(10, 'y', y_col)
df.info()
10,热力图
# Correlation heatmap of the (now fully numeric) training columns.
fig, axis = plt.subplots(figsize=(16, 8))
sns.heatmap(
    df.corr(),
    annot=True,
    cmap="Blues",
    fmt='.0f',
    ax=axis,
    linewidths=5,
    cbar=False,
    annot_kws={"size": 16},
)
plt.xticks(size=18)
plt.yticks(size=12, rotation=0)
plt.ylabel("Variables")
plt.title("Descriptive Statistics", size=16)
plt.show()
11,将训练集二八分,并设置random_state,保证每次模型的相同,生成的数据集相同,以及拆分的结果相同
# 80/20 split with a fixed seed so models, data and splits are reproducible.
random_state = 42
X = df.iloc[:, :10]
y = df['y']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state)
12,使用StandardScaler进行标准化(注意这是标准化而非归一化;不需要特征缩放的机器学习模型很少,我知道的只有决策树,随机森林和朴素贝叶斯),尝试使用逻辑回归(LR)
# Baseline model: standardise the features (scaler fitted on the training
# split only, so no test-set leakage), fit a logistic regression, and
# inspect its classification report and confusion matrix.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

classifier = LogisticRegression(random_state=random_state)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
df1 = pd.DataFrame(data=cm, index=["0", "1"], columns=["0", "1"])
f, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(df1, annot=True, cmap="Greens", fmt='.0f',
            ax=ax, linewidths=5, cbar=False, annot_kws={"size": 16})
plt.xlabel("Predicted Label")
plt.xticks(size=12)
plt.yticks(size=12, rotation=0)
plt.ylabel("True Label")
plt.title("Confusion Matrix", size=12)
plt.show()
预测结果如图,准确率(accuracy)达到了0.89
13,使用lda降维的lr预测(本项目为有监督学习,适合lda,而非pca)
# Project onto a single discriminant axis with LDA (supervised, unlike
# PCA — hence the choice for this labelled problem), then refit logistic
# regression on the 1-D projection.
lda = LDA(n_components=1)
X_train_lda = lda.fit_transform(X_train, y_train)
X_test_lda = lda.transform(X_test)

classifier_lda = LogisticRegression(random_state=random_state)
classifier_lda.fit(X_train_lda, y_train)
y_pred_lda = classifier_lda.predict(X_test_lda)
print(classification_report(y_test, y_pred_lda))

cm = confusion_matrix(y_test, y_pred_lda)
df1 = pd.DataFrame(data=cm, index=["0", "1"], columns=["0", "1"])
f, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(df1, annot=True, cmap="Greens", fmt='.0f', ax=ax,
            linewidths=5, cbar=False, annot_kws={"size": 16})
plt.xlabel("Predicted Label")
plt.xticks(size=12)
plt.yticks(size=12, rotation=0)
plt.ylabel("True Label")
plt.title("Confusion Matrix", size=12)
plt.show()
正确率依然达到了0.89
14,使用其它模型并加上参数,进行预测,由于时间紧迫,折数仅设置成2,再进行网格化搜索
# Candidate models for the grid search below; order must match
# `classifier_param`. SVC is commented out (too slow for this dataset).
classifier = [DecisionTreeClassifier(random_state=random_state),
#SVC(random_state=random_state, probability=True),
RandomForestClassifier(random_state=random_state),
LogisticRegression(random_state=random_state),
KNeighborsClassifier(),
SGDClassifier(random_state=random_state),
GradientBoostingClassifier(random_state=random_state),
LGBMClassifier(random_state=random_state)]
# Hyper-parameter grids, one per candidate model, in the same order as the
# `classifier` list above (SVC disabled there as well).
dt_param_grid = {"min_samples_split": range(10, 500, 20),
                 "max_depth": range(1, 20, 2)}
# svc_param_grid = {"kernel": ["rbf"],
#                   "gamma": [0.001, 0.01, 0.1, 1],
#                   "C": [0.1, 1, 10, 50, 100, 200, 300, 1000]}
rf_param_grid = {"max_features": [1, 3, 10],
                 "min_samples_split": [2, 3, 10],
                 "min_samples_leaf": [1, 3, 10],
                 "bootstrap": [False],
                 "n_estimators": [100, 300],
                 "criterion": ["gini"]}
# FIX: with the default 'lbfgs' solver every 'l1' grid cell fails to fit
# (lbfgs only supports l2/none); 'saga' supports 'l1', 'l2' and 'none'.
logreg_param_grid = {"C": np.logspace(-4, 4, 20),
                     "penalty": ["l1", "l2", "none"],
                     "solver": ["saga"]}
knn_param_grid = {"n_neighbors": np.linspace(2, 20, 12, dtype=int).tolist(),
                  "weights": ["uniform", "distance"],
                  "metric": ["euclidean", "manhattan", "minkowski"],
                  "leaf_size": [1, 3, 5, 12, 30]}
sgdc_param_grid = {
    "loss": ["hinge", "log", "squared_hinge", "modified_huber"],
    "alpha": [0.0001, 0.001, 0.01, 0.1],
    "penalty": ["l2", "l1", "none"]}
gbc_param_grid = {
    "learning_rate": [0.05, 0.1, 0.2],
    "min_samples_split": [2, 3, 10],
    "min_samples_leaf": [1, 3, 10]
}
lgbmc_param_grid = {
    'num_leaves': [31, 127],
    'reg_alpha': [0.1, 0.5],
    'min_data_in_leaf': [30, 50, 100, 300],
    'lambda_l1': [0, 1, 1.5],
    'lambda_l2': [0, 1]}
classifier_param = [dt_param_grid,
                    # svc_param_grid,
                    rf_param_grid,
                    logreg_param_grid,
                    knn_param_grid,
                    sgdc_param_grid,
                    gbc_param_grid,
                    lgbmc_param_grid]
# Run a grid search per model and collect CV / hold-out metrics.
# (Body indentation was lost in the original paste; restored here.)
cv_result = []
best_estimators = []
mean_squared_errors = []
roc_auc_scores = []
recall_scores = []
precision_scores = []
f1_scores = []
for i in range(len(classifier)):
    print("---------------------------------------------------------------------------")
    # 2-fold stratified CV (folds deliberately low to keep runtime down).
    clf = GridSearchCV(classifier[i],
                       param_grid=classifier_param[i],
                       cv=StratifiedKFold(n_splits=2),
                       scoring="accuracy",
                       n_jobs=1, verbose=2)
    clf.fit(X_train, y_train)
    # Predict once and reuse; the original called clf.predict(X_test)
    # four times per iteration for no benefit.
    y_test_pred = clf.predict(X_test)
    cv_result.append(clf.best_score_)
    mean_squared_errors.append(mean_squared_error(y_test, y_test_pred))
    # ROC AUC would need clf.predict_proba(X_test)[:, 1]; some of the
    # candidate models (e.g. SGD with hinge loss) lack predict_proba.
    recall_scores.append(recall_score(
        y_test, y_test_pred, average='weighted'))
    precision_scores.append(precision_score(
        y_test, y_test_pred, average='weighted'))
    f1_scores.append(f1_score(y_test, y_test_pred, average='weighted'))
    best_estimators.append(clf.best_estimator_)
    print("Model: {}".format(classifier[i]))
    print("Accuracy: %{}".format(round(cv_result[i]*100, 2)))
    print("MSE: {}".format(mean_squared_errors[i]))
    print("Recall: {}".format(recall_scores[i]))
    print("Precision: {}".format(precision_scores[i]))
    print("F1-Score: {}".format(f1_scores[i]))
    print("Best Estimator: {}".format(clf.best_estimator_))
    print("---------------------------------------------------------------------------")
sns.set_style("darkgrid")
# One row per model, one column per metric, rendered as a heatmap.
model_names = ["DecisionTreeClassifier",
               # "SVC",
               "RandomForestClassifier",
               "LogisticRegression",
               "KNeighborsClassifier",
               "SGDClassifier",
               "GBClassifier",
               "LGBMClassifier"]
cv_results = pd.DataFrame({"Accuracy": cv_result,
                           "MSE": mean_squared_errors,
                           "Recall": recall_scores,
                           "Precision": precision_scores,
                           "F1-Score": f1_scores},
                          index=model_names)
cv_results.index.name = "Models"
f, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(cv_results, annot=True, cmap="Blues", fmt='.3f',
            ax=ax, linewidths=5, cbar=False,
            annot_kws={"size": 18})
plt.xticks(size=18)
plt.yticks(size=18, rotation=0)
plt.ylabel("Models")
plt.title("Grid Search Results", size=16)
plt.show()
结果如下图
# Bar chart of the 2-fold CV mean accuracy for every model.
cv_results = pd.DataFrame({"Cross Validation Means": cv_result,
                           "Models": ["DecisionTreeClassifier",
                                      # "SVC",
                                      "RandomForestClassifier",
                                      "LogisticRegression",
                                      "KNeighborsClassifier",
                                      "SGDClassifier",
                                      "GBClassifier",
                                      "LGBMClassifier"]})
plt.figure(figsize=(10, 6))
# FIX: seaborn >= 0.12 removed positional x/y arguments to barplot;
# passing them as keywords works on old and new versions alike.
sns.barplot(x="Cross Validation Means", y="Models",
            data=cv_results, palette="Set1")
plt.xlabel("Mean Accuracy", size=12)
plt.yticks(size=14)
plt.title("Cross Validation Scores", size=12)
plt.show()
15,使用VotingClassifier进行集成学习
# Soft-voting ensemble of the three strongest tuned models
# (random forest, gradient boosting, LightGBM from `best_estimators`).
ensemble_members = [("rfc", best_estimators[1]),
                    ("gbc", best_estimators[5]),
                    ("lgbm", best_estimators[6])]
votingC = VotingClassifier(estimators=ensemble_members, voting='soft')
votingC = votingC.fit(X_train, y_train)
voting_pred = votingC.predict(X_test)
print(classification_report(y_test, voting_pred))

cm = confusion_matrix(y_test, voting_pred)
df1 = pd.DataFrame(data=cm, index=["0", "1"], columns=["0", "1"])
f, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(df1, annot=True, cmap="Greens", fmt='.0f',
            ax=ax, linewidths=5, cbar=False, annot_kws={"size": 16})
plt.xlabel("Predicted Label")
plt.xticks(size=12)
plt.yticks(size=12, rotation=0)
plt.ylabel("True Label")
plt.title("Confusion Matrix", size=12)
plt.show()