1. svm
from sklearn import svm
clf=svm.SVC(gamma=0.0001, C=100, kernel='linear') #gamma是核函数系数,不是学习率
clf.coef_:线性分类器的系数,shape = [n_class-1, n_features]
clf.intercept_:分类器的常量
w=clf.coef_通过这些系数变化,可求出线性分类器的相关参数
2. kmeans
kmeans = KMeans(n_clusters=2)
kmeans.fit(X)
centroids = kmeans.cluster_centers_ #centroids是一个坐标数组shape=(n_clusters, X.shape[1])
labels = kmeans.labels_ #labels是一个一维数组,shape=(X.shape[0]),范围0~n_clusters-1
colors=["g.","r."]
for i in range(len(X)):
plt.plot(X[i][0], X[i][1], colors[labels[i]],) #label数组的值为0~n_cluster-1
plt.scatter(centroids[:, 0], centroids[:, 1], marker="x")
plt.show()
3. MeanShift
import numpy as np
from sklearn.cluster import MeanShift
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
centers = [[1,1],[5,5]] #实际中心点
X, _ = make_blobs(n_samples=200, centers = centers, cluster_std=1) #下划线表示没用的,可忽略的, _是实际的label
plt.scatter(X[:, 0], X[:, 1])
ms=MeanShift()
ms.fit(X)
labels = ms.labels_ #预测的label
cluster_centers = ms.cluster_centers_ #获取预测中心点
n_clusters_ = len(np.unique(labels)) #获取cluster数量
sklearn需要ndarray类型的数据,但是也能识别list数据
4. knn
knn=sklearn.neighbors.KNeighborsClassifier(n_neighbors=5)
knn.fit(X, y)
knn.predict(X_new)
5. 评价函数
from sklearn import metrics
print(metrics.accuracy_score(y, y_pred))
6. 将训练数据进行cross_validation
from sklearn.cross_validation import train_test_split
#训练数据取出0.4作为测试集,随机分配,每次运行结果都不同,还可以给出random_state参数,这样每次分割的都是一样的
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
train_test_split(X, y, test_size=0.4, random_state=4)
通常的步骤是:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.4)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))
7. 使用LinearRegression获取系数和常量
- logreg.intercept_ 常量
- logreg.coef_ 系数
8. 回归问题的误差度量:
from sklearn import metrics
#MAE度量
print(metrics.mean_absolute_error(y, y_perd)) #计算预测值和实际值之差的绝对值之和
#MSE度量
print(metrics.mean_squared_error(y, y_pred)) #计算预测值和实际值之差的平方的均值
#RMSE度量 root MSE
np.sqrt(metrics.mean_squared_error(y, y_pred)) #上式开方即可
9. 交叉验证
from sklearn.cross_validation import KFold
kf = KFold(25, n_folds=5, shuffle=False) #25条数据,做5倍交叉验证
for iteration, data, in enumerate(kf, start=1):
print(iteration, data[0], data[1]) #data[0]表示训练集,data[1]表示测试集
通常k=10最佳,建议使用分层抽样,让每类样本被抽样到的概率相等.
10. 分层抽样
使用sklearn.cross_val_score实现分层抽样
from sklearn.cross_validation import cross_val_score
knn = KNeighborsClassifier(n_neighbors=5)
scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy') #knn算法,k=5, 10倍交叉验证,cv(cross validation)=10
print(scores) #返回一个k维数组
print(scores.mean()) #打印平均值
对于回归问题,scoring='mean_squared_error'
11. 使用CV搜索参数
from sklearn.grid_search import GridSearchCV
#定义参数搜索区间
k_range=range(1, 31)
创建参数grid
param_grid=dict(n_neighbors=k_range)
=> param: {'n_neighbors': [1,2,3...,30]}
#初始化一个grid,因此后面会重复len(param_grid)次
grid=GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')
grid.fit(X, y)
grid.grid_scores_ #打印全部结果,包括[[mean: , std: params:{'neighbors':1} .....]] std太大不可靠
打印参数
print(grid.grid_scores_[0].parameters)
print(grid.grid_scores_[0].cv_validation_scores)
print(grid.grid_scores_[0].mean_validation_score)
grid_mean_scores = [result.mean_validation_score for result in grid.grid_scores_]
plt.plot(k_range, grid_mean_scores)
print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_)
12. 搜索多个参数
k_range = range(1,31)
weight_options=['uniform', 'distance']
param_grid=dict(n_neighbors=k_range, weights=weight_options)
grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')
grid.fit(X, y)
grid.grid_scores_
print(grid.grid_scores_)
print(grid.best_params_)
然后可以直接使用最佳参数来预测
y_pred = grid.predict(X_new)
13. 随机搜索
from sklearn.grid_search import RandomizedSearchCV
param_dist = dict()
rand = RandomizedSearchCV(knn, param_dist, cv=10, scoring='accuracy', n_iter=10, random_state=5)
rand.fit(X, y)
rand.grid_scores_
- 分类模型的评价
- 回归: MAE, MSE, RMSE
- 分类: AUC, ROC
14. 打印confusion_matrix:
print(metrics.confusion_matrix(y_test, y_pred)) #通常将actual放前,predict放后面
一般的,在实际的值当中,1被当做positive,0被当做negative,预测正确为True,预测错误为False
confusion_matrix=metrics.confusion_matrix(y_test, y_pred)
TP = confusion_matrix[1,1]
TN = confusion_matrix[0,0]
FP = confusion_matrix[0,1]
FN = confusion_matrix[1,0]
计算recall, precision
recall = metrics.recall_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
15.clf.predict_proba函数
使用clf.predict_proba(X_test)来求属于每一个类别的概率(predict probability)
print(clf.predict_proba(X_test)) #打印一个概率矩阵
16.调整概率阈值
from sklearn.preprocessing import binarize
y_pred = binarize(y_pred_prob, 0.3)[0] #将概率矩阵传进去,然后赋值threshold
#打印新的confusion_matrix
print(metrics.confusion_matrix(y_test, y_pred))
17. ROC曲线和AUC
ROC可以观察sensitivy和specificity受threshold的变化情况
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_prob)
plt.plot(fpr, tpr) # False Positive Rate/True Positive Rate
# 使用AUC作为评测标准
metrics.roc_auc_score(y_test, y_pred_prob)
18. 使用AUC作为交叉验证的指标
from sklearn.cross_validation import cross_val_score
cross_val_score(logreg, X, y, cv=10, scoring='roc_auc')
19.将所有数据分成train和test两部分
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
20.使用LR来训练,并得到测试集的y的值
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred_class = logreg.predict(X_test)
21.使用accuracy来评价训练结果
from sklearn import metrics
print(metrics.accuracy_score(y_test, y_pred_class))
22.使用series.value_counts()统计真实值的label分布情况,使用series.mean() 计算均值
23.使用y_test.mean()计算1的比例,使用1-y_test.mean()计算0的比例
24.使用confusion_matrix来进行评价
from sklearn import metrics
metrics.confusion_matrix(y_test, y_pred_class)
# TN:Truly predicted that it's Negative values(actual 0, predict 0)
# TP:Truly predicted that it's Positive values(actual 1, predict 1)
# FN:Falsely predicted that it's Negative values(actual 1, predict 0)
# FP:Falsely predicted that it's Positive values(actual 0, predict 1)
- TP=confusion[1,1]
- TN=confusion[0,0]
- FP=confusion[0,1]
- FN=confusion[1,0]
25.accuray_score=(TN+TP)/(TN+TP+FN+FP)
26.Classification Error(Misclassification Rate):为1-metrics.accuracy_score(y_test, y_pred),即 (FP+FN)/(TP+TN+FP+FN)。
27.Sensitivity(Recall, True Positive Rate):当实际值为Positive(1)的时候,判断预测正确的比率
sensitivity = TP/(TP+FN)
metrics.recall_score(y_test, y_pred)
28.Specificity:当实际值为Negative(0)的时候,判断预测正确的比率
specificity = TN/(TN+FP)
29.False Positive Rate:
当实际值是Negative(0)的时候,预测错误的比率
FP/(TN+FP)
30.Precision:
当预测值为Positive(1)时,预测正确的比率
precision = TP/(TP+FP)
metrics.precision_score(y_test, y_pred)
31.调整预测的概率阈值
predict_proba会返回一个shape为(n,2)的矩阵,第一列是预测为0的概率,第二列是预测为1的概率
from sklearn.preprocessing import binarize
binarize(matrix, threshold=0.3) #返回一个0/1的matrix,将matrix中大于0.3的值记为1,小于0.3的记为0
#binarize默认以0.5作为阈值
32.ROC
受试者工作特征曲线 (receiver operating characteristic curve,简称ROC曲线),可用来查看sensitivity和specificity随不同阈值的变化,而不用时刻改变阈值
y_pred_prob = logreg.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_prob)
plt.plot(fpr, tpr)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.title("ROC curve")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.grid(True)
plt.show()
但是阈值无法从ROC中看出,因此需要自己计算
def evaluate_threshold(threshold):
    print("sensitivity:", tpr[thresholds > threshold][-1])
    print("specificity:", 1 - fpr[thresholds > threshold][-1])
evaluate_threshold(0.5)
evaluate_threshold(0.3)
通过这种方法来寻找最佳阈值,最佳阈值是使得ROC的横坐标尽可能小,纵坐标尽可能大
33.AUC(Area under the roc Curve)
print(metrics.roc_auc_score(y_test, y_pred_prob))