Scikit-Learn学习笔记

1. svm

from sklearn import svm
clf=svm.SVC(gamma=0.0001, C=100, kernel='linear') #gamma是核函数系数(对rbf/poly/sigmoid核生效),并非学习率

clf.coef_:线性分类器的系数,shape = [n_class-1, n_features]
clf.intercept_:分类器的常量

w=clf.coef_通过这些系数变化,可求出线性分类器的相关参数

2. kmeans

kmeans = KMeans(n_clusters=2)
kmeans.fit(X)

centroids = kmeans.cluster_centers_  #centroids是一个坐标数组shape=(n_clusters, X.shape[1])
labels = kmeans.labels_ #labels是一个一维数组,shape=(X.shape[0]),范围0~n_clusters-1

colors=["g.","r."]
for i in range(len(X)):
    plt.plot(X[i][0], X[i][1], colors[labels[i]]) #label数组的值为0~n_cluster-1

plt.scatter(centroids[:, 0], centroids[:, 1], marker="x")
plt.show()

3. MeanShift

import numpy as np
from sklearn.cluster import MeanShift
from sklearn.datasets.samples_generator import make_blobs
import matplotlib.pyplot as plt

centers = [[1,1],[5,5]] #实际中心点
X, _ = make_blobs(n_samples=200, centers = centers, cluster_std=1) #下划线表示没用的,可忽略的, _是实际的label
plt.scatter(X[:, 0], X[:, 1])
ms=MeanShift()
ms.fit(X)
labels = ms.labels_ #预测的label
cluster_centers = ms.cluster_centers_ #获取预测中心点
n_clusters_ = len(np.unique(labels)) #获取cluster数量

sklearn需要ndarray类型的数据,但是也能识别list数据

4. knn

knn=sklearn.neighbors.KNeighborsClassifier(n_neighbors=5)
knn.fit(X, y)
knn.predict(X_new)

5. 评价函数

from sklearn import metrics
print(metrics.accuracy_score(y, y_pred))

6. 将训练数据进行cross_validation

from sklearn.cross_validation import train_test_split

#训练数据取出0.4作为测试集,随机分配,每次运行结果都不同,还可以给出random_state参数,这样每次分割的都是一样的
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
train_test_split(X, y, test_size=0.4, random_state=4)

通常的步骤是:

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.4)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))

7. 使用LinearRegression获取系数和常量

  • logreg.intercept_ 常量
  • logreg.coef_ 系数

8. 回归问题的误差度量:

from sklearn import metrics
#MAE度量
print(metrics.mean_absolute_error(y, y_pred)) #计算预测值和实际值之差的绝对值的平均
#MSE度量
print(metrics.mean_squared_error(y, y_pred)) #计算预测值和实际值之差的平方的平均
#RMSE度量 root MSE
np.sqrt(metrics.mean_squared_error(y, y_pred)) #上式开方即可

9. 交叉验证

from sklearn.cross_validation import KFold
kf = KFold(25, n_folds=5, shuffle=False) #25条数据,做5倍交叉验证
for iteration, data in enumerate(kf, start=1):
    print(iteration, data[0], data[1]) #data[0]表示训练集,data[1]表示测试集

通常k=10最佳,建议使用分层抽样,让每类样本被抽样到的概率相等.

10. 分层抽样

使用sklearn.cross_validation.cross_val_score实现分层抽样

from sklearn.cross_validation import cross_val_score
knn = KNeighborsClassifier(n_neighbors=5)
scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')  #knn算法,k=5, 10倍交叉验证,cv(cross validation)=10
print(scores) #返回一个k维数组
print(scores.mean()) #打印平均值

对于回归问题,scoring='mean_squared_error'

11. 使用CV搜索参数

from sklearn.grid_search import GridSearchCV
#定义参数搜索区间
k_range=range(1, 31)
创建参数grid
param_grid=dict(n_neighbors=k_range)
=> param: {'n_neighbors': [1,2,3...,30]}
#初始化一个grid,因此后面会重复len(param_grid)次
grid=GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')

grid.fit(X, y)
grid.grid_scores_ #打印全部结果,包括[[mean: , std: params:{'neighbors':1} .....]] std太大不可靠

打印参数
print(grid.grid_scores_[0].parameters)
print(grid.grid_scores_[0].cv_validation_scores)
print(grid.grid_scores_[0].mean_validation_score)

grid_mean_scores = [result.mean_validation_score for result in grid.grid_scores_]

plt.plot(k_range, grid_mean_scores)

print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_)

12. 搜索多个参数

k_range = range(1,31)
weight_options=['uniform', 'distance']

param_grid=dict(n_neighbors=k_range, weights=weight_options)

grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')

grid.fit(X, y)

grid.grid_scores_

print(grid.grid_scores_)
print(grid.best_params_)

然后可以直接使用最佳参数来预测

y_pred = grid.predict(X_new)

13. 随机搜索

from sklearn.grid_search import RandomizedSearchCV
param_dist = dict()
rand = RandomizedSearchCV(knn, param_dist, cv=10, scoring='accuracy', n_iter=10, random_state=5)
rand.fit(X, y)
rand.grid_scores_
  1. 分类模型的评价
  • 回归: MAE, MSE, RMSE
  • 分类: ROC, AUC

14. 打印confusion_matrix:

print(metrics.confusion_matrix(y_test, y_pred)) #通常将actual放前,predict放后面
一般的,在实际的值当中,1被当做positive,0被当做negative,预测正确为True,预测错误为False
confusion_matrix=metrics.confusion_matrix(y_test, y_pred)
TP = confusion_matrix[1,1]
TN = confusion_matrix[0,0]
FP = confusion_matrix[0,1]
FN = confusion_matrix[1,0]

计算recall, precision

recall = metrics.recall_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)

15.clf.predict_proba函数

使用clf.predict_proba(X_test)来求属于每一个类别的概率(predict probability)

print(clf.predict_proba(X_test)) #打印一个概率矩阵

16.调整概率阈值

from sklearn.preprocessing import binarize
y_pred = binarize(y_pred_prob, 0.3)[0] #将概率矩阵传进去,然后赋值threshold
#打印新的confusion_matrix
print(metrics.confusion_matrix(y_test, y_pred))

17. ROC曲线和AUC

ROC可以观察sensitivity和specificity受threshold的变化情况

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_prob)
plt.plot(fpr, tpr) # False Positive Rate/True Positive Rate
# 使用AUC作为评测标准
metrics.roc_auc_score(y_test, y_pred_prob)

18. 使用AUC作为交叉验证的指标

from sklearn.cross_validation import cross_val_score
cross_val_score(logreg, X, y, cv=10, scoring='roc_auc')

19.将所有数据分成train和test两部分

from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

20.使用LR来训练,并得到测试集的y的值

from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

y_pred_class = logreg.predict(X_test)

21.使用accuracy来评价训练结果

from sklearn import metrics
print(metrics.accuracy_score(y_test, y_pred_class))

22.使用series.value_counts()统计真实值的label分布情况,使用series.mean() 计算均值

23.使用y_test.mean()计算1的比例,使用1-y_test.mean()计算0的比例

24.使用confusion_matrix来进行评价

from sklearn import metrics
metrics.confusion_matrix(y_test, y_pred_class)

# TN:Truly predicted that it's Negative values(actual 0, predict 0)
# TP:Truly predicted that it's Positive values(actual 1, predict 1)
# FN:Falsely predicted that it's Negative values(actual 1, predict 0)
# FP:Falsely predicted that it's Positive values(actual 0, predict 1)
  • TP=confusion[1,1]
  • TN=confusion[0,0]
  • FP=confusion[0,1]
  • FN=confusion[1,0]

25.accuracy_score=(TN+TP)/(TN+TP+FN+FP)

26.Classification Error(Misclassification Rate):为1-metrics.accuracy_score(y_test, y_pred),即 (FP+FN)/(TP+TN+FP+FN)。

27.Sensitivity(Recall, True Positive Rate):当实际值为Positive(1)的时候,判断预测正确的比率

sensitivity = TP/(TP+FN)
metrics.recall_score(y_test, y_pred)

28.Specificity:当实际值为Negative(0)的时候,判断预测正确的比率

specificity = TN/(TN+FP)

29.False Positive Rate:

当实际值是Negative(0)的时候,预测错误的比率
FP/(TN+FP)

30.Precision:

当预测值为Positive(1)时,预测正确的比率

precision = TP/(TP+FP)
metrics.precision_score(y_test, y_pred)

31.调整预测的概率阈值

predict_proba会返回一个shape为(n,2)的矩阵,第一列是预测为0的概率,第二列是预测为1的概率

from sklearn.preprocessing import binarize
binarize(matrix, threshold=0.3) #返回一个0/1的matrix,将matrix中大于0.3的值记为1,小于0.3的记为0
#binarize默认以0.5作为阈值

32.ROC

受试者工作特征曲线 (receiver operating characteristic curve,简称ROC曲线),可用来查看sensitivity和specificity随不同阈值的变化,而不用时刻改变阈值

y_pred_prob = logreg.predict_proba(X_test)[:,1] #predict_proba是模型对象的方法,不在metrics模块中

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_prob)
plt.plot(fpr, tpr)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.title("ROC curve")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.grid(True)
plt.show()

但是阈值无法从ROC中看出,因此需要自己计算

def evaluate_threshold(threshold):
	print("sensitivity:", tpr[thresholds > threshold][-1])
	print("specificity:", 1 - fpr[thresholds > threshold][-1])

evaluate_threshold(0.5)
evaluate_threshold(0.3)

通过这种方法来寻找最佳阈值,最佳阈值是使得ROC的横坐标尽可能小,纵坐标尽可能大

33.AUC(Area under the roc Curve)

print(metrics.roc_auc_score(y_test, y_pred_prob))
  • 1
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值