建模评估
监督学习
分类评估
特点:离散值
混淆矩阵:
ROC曲线:理想曲线向左上方偏斜(曲线越靠近左上角,AUC越大,分类效果越好)
代码实现
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,recall_score,f1_score,confusion_matrix

# Fit a random forest and report accuracy, recall, F1 and the confusion
# matrix on the test split.
# max_features='sqrt' is what the old 'auto' alias meant for classifiers;
# 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3.
# NOTE(review): the model is fitted on the validation split rather than on
# X_train/Y_train -- confirm this is intended.
ran_clf = RandomForestClassifier(max_features='sqrt', n_estimators=100).fit(X_validation, Y_validation)
print("\ntest:\n")
Y_pred = ran_clf.predict(X_test)
print("ACC", accuracy_score(Y_test, Y_pred))  # accuracy
print("REC", recall_score(Y_test, Y_pred))  # recall
print("F-score", f1_score(Y_test, Y_pred))  # F1 score
# Print the matrix explicitly: as a bare expression its value is silently
# discarded when this runs as a script instead of as the last line of a
# notebook cell.
print(confusion_matrix(Y_test, Y_pred))
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve,auc
import seaborn as sns
from matplotlib.font_manager import FontProperties  # used to pick a font that can render Chinese text

# Plot the ROC curve of a random-forest classifier on the test split and
# print the area under it.
# NOTE(review): the font path below is Windows-specific.
myfont=FontProperties(fname=r'C:/Windows/Fonts/simhei.ttf',size=14)
sns.set(font=myfont.get_name(), style="white")
plt.rcParams['axes.unicode_minus']=False  # keep the minus sign rendering correctly with the custom font

# max_features='sqrt' is what the old 'auto' alias meant for classifiers;
# 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3.
# NOTE(review): the model is fitted on the validation split rather than on
# X_train/Y_train -- confirm this is intended.
ran_clf = RandomForestClassifier(max_features='sqrt', n_estimators=100).fit(X_validation, Y_validation)
Y_pred = ran_clf.predict(X_test)
# A ROC curve needs continuous scores: hard 0/1 predictions give a single
# threshold and therefore a degenerate three-point curve. Column 1 of
# predict_proba is the probability of ran_clf.classes_[1]
# (assumes a binary problem whose positive class is the larger label -- TODO confirm).
Y_score = ran_clf.predict_proba(X_test)[:, 1]

f = plt.figure()
fpr,tpr,threshold = roc_curve(Y_test, Y_score)
plt.plot(fpr, tpr)  # draw the ROC curve
plt.xticks(fontsize=16)  # tick-label font size on the x axis
plt.yticks(fontsize=16)  # tick-label font size on the y axis
plt.xlabel('')
plt.ylim(0, 1)
plt.title('ROC曲线', fontsize=20)
plt.show()
print('AUC:', auc(fpr,tpr))  # area under the ROC curve
回归评估
特点:连续值
代码实现
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

# Fit a ridge regression that predicts `last_evaluation` from
# `number_project` and `average_monthly_hours`, then report its
# coefficients and its MAE / MSE / R2 on the test split.
X = features[['number_project', 'average_monthly_hours']]
Y = features['last_evaluation']
f_v, l_v = X.values, Y.values  # underlying arrays of the features / target

# Hold out 20% of the rows as the test set.
X_tt, X_test, Y_tt, Y_test = train_test_split(f_v, l_v, test_size=0.2)
# Split the remainder so that train : validation = 3 : 1.
X_train, X_validation, Y_train, Y_validation = train_test_split(X_tt, Y_tt, test_size=0.25)

# Train on the training split only.
regr = Ridge(alpha=1)
regr.fit(X_train, Y_train)
Y_pred = regr.predict(X_test)

print("-coef:", regr.coef_)
for label, metric in (
    ("-MAE:", mean_absolute_error),
    ("-MSE:", mean_squared_error),
    ("-R2:", r2_score),
):
    print(label, metric(Y_test, Y_pred))
非监督学习
非监督评估
代码实现
import numpy as np
from sklearn.datasets import make_circles,make_blobs,make_moons
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score

# Cluster four toy 2-D datasets and draw a grid of scatter plots: one row
# per entry in `models`, one column per dataset. The silhouette score is
# printed for every clustering that produces a valid labelling.
n_samples = 1000
# Each dataset is a (points, labels) pair; the labels are not used below.
circles = make_circles(n_samples=n_samples, factor=0.5, noise=0.05)
moons = make_moons(n_samples=n_samples, noise=0.05)
blobs = make_blobs(n_samples=n_samples, random_state=8)
random_data = np.random.rand(n_samples, 2), None
colors = 'bgrcmyk'  # one matplotlib colour code per cluster label
data = [circles, moons, blobs, random_data]
models = [('None', None), ('KMeans', KMeans(n_clusters=3))]

f = plt.figure()
for inx, clt in enumerate(models):
    clt_name, clt_entity = clt
    for i, dataset in enumerate(data):
        X, Y = dataset
        if clt_entity is None:
            # No model: put every point into a single cluster 0.
            clt_res = [0] * len(X)
        else:
            clt_entity.fit(X)
            # Use the builtin int: the np.int alias was removed in NumPy 1.24.
            clt_res = clt_entity.labels_.astype(int)  # cluster label per point
        # Subplot position: row = model index, column = dataset index.
        f.add_subplot(len(models), len(data), inx * len(data) + i + 1)
        plt.title(clt_name)
        try:
            print('第二行图-',i+1, '轮廓系数: ',silhouette_score(X, clt_res))
        except ValueError:
            # silhouette_score rejects labellings with a single cluster
            # (always the case for the 'None' row); skip the score there.
            pass
        # One scatter call per subplot instead of one per point; the modulo
        # keeps the colour lookup valid if there are more clusters than colours.
        point_colors = [colors[label % len(colors)] for label in clt_res]
        plt.scatter(X[:, 0], X[:, 1], c=point_colors)
plt.tight_layout()
plt.show()