Ex1 使用决策树分类方法对鸢尾花数据集进行分类。
Ex2 使用逻辑回归模型对鸢尾花数据集进行分类。
Ex3分别使用准确率(accuracy)、ROC曲线、AUC作为评价指标对比Ex1和Ex2实验结果。
Ex4 实现Kaggle电信用户流失分类
1)样本数据分析
该实例数据同样来自Kaggle,它的每一条数据为一个用户的信息,共有21个有效字段,其中,最后一个字段Churn标志该用户是否流失。运用numpy和pandas等工具对数据进行初步分析,尽量理解特征之间的关系。
- 分析实验要求
- 分析实验数据的完整性、重复性
- 分析实验数据各特征之间的关联关系
2)用户画像
运用各类分析工具对流失用户和非流失用户进行特征分析,说明流失用户和非流失用户的特点。
- 分析各特征与用户是否流失的关系
- 综合上述关系,给出流失用户和非流失用户的特点
3)分类算法建模及分析
运用各类分类算法对数据进行建模,包括:逻辑回归模型、决策树模型、随机森林模型。并用准确率和AUC指标对它们的预测效果进行评估。
- 编码、特征提取
- 数据预处理
- 学习分类算法的应用方法
- 对样本数据进行建模
- 用准确度指标和AUC指标评估模型
- 记录并分析实验结果
- Ex1源代码:
import pandas as pd from sklearn.datasets import load_iris iris = load_iris() iris_df = pd.DataFrame(iris.data, index = iris.target, columns = iris.feature_names) from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(iris_df, iris.target, train_size = 0.55) #使用决策树分类方法对鸢尾花数据集进行分类 from sklearn.tree import DecisionTreeClassifier tree = DecisionTreeClassifier() tree.fit(X_train, y_train) y_predict = tree.predict(X_test) from sklearn.metrics import classification_report, accuracy_score, confusion_matrix print('决策树Accracy:',accuracy_score(y_predict, y_test)) print('决策树混淆矩阵\n',confusion_matrix(y_test, y_predict)) print('决策树分类性能报告\n',classification_report(y_predict, y_test, target_names=['setosa', 'versicolor', 'virginica']))
Ex2源代码:
import pandas as pd from sklearn.datasets import load_iris iris = load_iris() iris_df = pd.DataFrame(iris.data, index = iris.target, columns = iris.feature_names) from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(iris_df, iris.target, train_size = 0.55) #使用逻辑回归模型对鸢尾花数据集进行分类 from sklearn.linear_model import LogisticRegression log = LogisticRegression(solver='newton-cg') log.fit(X_train, y_train) y_predict2 = log.predict(X_test) from sklearn.metrics import classification_report, accuracy_score, confusion_matrix print('逻辑回归Accracy:',accuracy_score(y_predict2, y_test)) print('逻辑回归混淆矩阵\n',confusion_matrix(y_test, y_predict2)) print('逻辑回归分类性能报告\n',classification_report(y_predict2, y_test,target_names=['setosa', 'versicolor', 'virginica']))
Ex3源代码:
def change_one_hot_label(X): # 用one_hot编码转换 T = np.zeros((X.size, 3)) for idx, row in enumerate(T): row[X[idx]] = 1 return T from sklearn import metrics import numpy as np print('决策树Accracy:',accuracy_score(y_predict, y_test)) y_score = tree.predict_proba(X_test) y_one_hot = change_one_hot_label(y_test) fpr, tpr, thresholds = metrics.roc_curve(y_one_hot.ravel(),y_score.ravel()) auc = metrics.auc(fpr, tpr) print('决策树AUC:', auc) print('逻辑回归Accracy:',accuracy_score(y_predict2, y_test)) y_score2 = log.predict_proba(X_test) y_one_hot2 = change_one_hot_label(y_test) fpr2, tpr2, thresholds2 = metrics.roc_curve(y_one_hot2.ravel(),y_score2.ravel()) auc2 = metrics.auc(fpr2, tpr2) print('逻辑回归AUC:', auc2) #画图 import matplotlib.pyplot as plt plt.plot(fpr, tpr, c = 'r', label = u'tree_auc=%.3f' % auc) plt.plot(fpr2, tpr2, c = 'b', label = u'log_auc=%.3f' % auc2) plt.plot((0, 1), (0, 1), c = 'k', ls = '--') plt.xlim((-0.01, 1.01)) plt.ylim((-0.01, 1.01)) plt.xticks(np.arange(0, 1.1, 0.1)) plt.yticks(np.arange(0, 1.1, 0.1)) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.grid(b=True, ls=':') plt.legend(loc='lower right') plt.show()
Ex4源代码:
import numpy as np import pandas as pd df = pd.read_csv('E:\研一\机器学习\telecom_users.csv') df.customerID.duplicated().sum() # 观察是否有重复值 df.nunique() # 观察特征的取值情况 #流失用户与非流失用户的个人信息对比: import matplotlib.pyplot as plt import seaborn as sns fig, axes = plt.subplots(2, 2, figsize=(10, 8)) sns.countplot(x='gender', data=df, hue='Churn', ax=axes[0][0]) sns.countplot(x='SeniorCitizen', data=df, hue='Churn', ax=axes[0][1]) sns.countplot(x='Partner', data=df, hue='Churn', ax=axes[1][0]) sns.countplot(x='Dependents', data=df, hue='Churn', ax=axes[1][1]) #流失用户与非流失用户的服务订阅信息对比 plt.rc('font', family='SimHei') plt.title("在网时长密度图") ax1 = sns.kdeplot(df[df['Churn'] == 'Yes']['tenure'], color='r', linestyle='-', label='Churn:Yes') ax1 =sns.kdeplot(df[df['Churn'] == 'No']['tenure'], color='b', linestyle='--', label='Churn:No') #编码,提取特征 df_clu = df.drop(['Unnamed: 0', 'customerID', 'Churn'], axis=1) labels = df['Churn'] df_clu # 二值对象型特征转换成数值型 df_clu['gender'] = df_clu['gender'].replace('Male', 1).replace('Female', 0) df_clu['Partner'] = df_clu['Partner'].replace('Yes', 1).replace('No', 0) df_clu['Dependents'] = df_clu['Dependents'].replace('Yes', 1).replace('No', 0) df_clu['PhoneService'] = df_clu['PhoneService'].replace('Yes', 1).replace('No', 0) df_clu['PaperlessBilling'] = df_clu['PaperlessBilling'].replace('Yes', 1).replace('No', 0) labels = labels.replace('Yes', 1).replace('No', 0) # 离散的,可用距离度量的对象型特征转化为数值型 df_clu['Contract'] = df_clu['Contract'].replace("Month-to-month", 1).replace("One year", 12).replace("Two year", 24) # 离散的,不宜用距离度量的特征用one-hot编码 df_clu = pd.get_dummies(df_clu) # 归一化 df_clu['tenure'] = ( df_clu['tenure'] - df_clu['tenure'].min() )/( df_clu['tenure'].max() - df_clu['tenure'].min() ) df_clu['Contract'] =( df_clu['Contract'] - df_clu['Contract'].min() )/( df_clu['Contract'].max() - df_clu['Contract'].min() ) df_clu['MonthlyCharges']=(df_clu['MonthlyCharges']-df_clu['MonthlyCharges'].min())/(df_clu['MonthlyCharges'].max()-df_clu['MonthlyCharges'].min() ) df_clu['TotalCharges']=(df_clu['TotalCharges']-df_clu['TotalCharges'].min() )/( df_clu['TotalCharges'].max() - df_clu['TotalCharges'].min() ) df_clu from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score, classification_report, roc_auc_score X_train, X_test, y_train, y_test = train_test_split(df_clu, labels, test_size=0.3, random_state = 1026) # 逻辑回归模型 from sklearn.linear_model import LogisticRegression log = LogisticRegression() log.fit(X_train, y_train) y_pred1 = log.predict(X_test) print('逻辑回归的精确度为:', accuracy_score(y_test, y_pred1)) print('逻辑回归的auc为:', roc_auc_score(y_test, y_pred1, average='micro')) print('分类报告:\n', classification_report(y_test, y_pred1)) # 决策树模型 from sklearn.tree import DecisionTreeClassifier tree = DecisionTreeClassifier() tree.fit(X_train, y_train) y_pred2 = tree.predict(X_test) print('决策树的精确度为:', accuracy_score(y_test, y_pred2)) print('决策树的auc为:', roc_auc_score(y_test, y_pred2, average='micro')) print('分类报告:\n', classification_report(y_test, y_pred2)) # 随机森林模型 from sklearn.ensemble import RandomForestClassifier forests = RandomForestClassifier() forests.fit(X_train, y_train) y_pred3 = forests.predict(X_test) print('随机森林的精确度为:', accuracy_score(y_test, y_pred3)) print('随机森林的auc为:', roc_auc_score(y_test, y_pred3, average='micro')) print('分类报告:\n', classification_report(y_test, y_pred3))
总结:
在本次分类实验中,学会运用逻辑回归模型、决策树模型、随机森林模型等各类分类算法对数据进行建模,并学习了编码、特征提取、数据预处理和用准确度指标和AUC指标评估模型等过程。