(1) sklearn.preprocessing
from sklearn import preprocessing
import numpy as np
# Feature matrix: one row per sample, one column per feature.
# BUG FIX: the original rows were not separated by commas (SyntaxError).
X = np.array([[1., -1., 2.],
              [2., 0., 0.],
              [0., 1., -1.]])
# Standardize each COLUMN to zero mean and unit variance —
# scaling in sklearn always operates per feature (per column).
X_scale = preprocessing.scale(X)
print(X_scale)
# Output:
# array([[ 0.        , -1.22474487,  1.33630621],
#        [ 1.22474487,  0.        , -0.26726124],
#        [-1.22474487,  1.22474487, -1.06904497]])
(2) sklearn.externals.joblib
跟 cPickle 一样,只不过只能用于保存模型到文件,并且适合更大的对象的保存。
# Persist a fitted estimator to disk — like cPickle, but better suited to
# objects holding large numpy arrays.
# NOTE(review): `sklearn.externals.joblib` is deprecated in recent sklearn
# releases in favour of the standalone `joblib` package — confirm the
# sklearn version this snippet targets.
from sklearn.externals import joblib
# Save the (previously trained) estimator `clf` to a file.
joblib.dump(clf,"filename.pkl")
# Load the saved model back from that file.
clf = joblib.load("filename.pkl")
(3) sklearn.svm
传统机器学习 svm 算法,关于原理可见其他博客
# Train a support-vector classifier and inspect its support vectors.
# FIX: Python 2 `print x` statements replaced with `print(x)`, which is
# valid syntax on both Python 2 and Python 3.
from sklearn import svm
X = [[0, 0], [1, 1], [1, 0]]  # training samples
y = [0, 1, 1]                 # class labels for each sample
clf = svm.SVC()
clf.fit(X, y)                     # train the model
result = clf.predict([[2, 2]])    # predict the class of a new point -> [1]
print(clf.support_vectors_)       # the support vectors: [[0. 0.] [1. 1.] [1. 0.]]
print(clf.support_)               # indices of the support vectors: [0 1 2]
print(clf.n_support_)             # number of support vectors per class: [1 2]
print(clf.decision_function([[2, 2]]))  # signed distance to the separating hyperplane: [0.80393476]
(4) sklearn.neighbors
最邻近搜索,里面有 KDTree , BallTree,KNeighborsClassifier 等等跟邻域计算有关的算法,这里不介绍算法原理
# KNeighborsClassifier — k-nearest-neighbours classification.
# FIX: Python 2 `print` statement replaced with the cross-version print().
from sklearn.neighbors import KNeighborsClassifier
X = [[0], [1], [2], [3], [4], [5], [6], [7], [8]]
y = [0, 0, 0, 1, 1, 1, 2, 2, 2]
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X, y)
print(neigh.predict([[1.1]]))  # predicted label for the query point
# KDTree — exact nearest-neighbour search structure.
from sklearn.neighbors import KDTree
X = [[0, 1], [1, 0], [1, 1]]
# BUG FIX: the original called `numpy.array(...)` but only `np` is
# imported at the top of the file (NameError).
kdt = KDTree(np.array(X, dtype=float), leaf_size=10, metric="minkowski")
y = [0, 0]  # query point
# Search for the 2 nearest neighbours of y.
[distances], [points] = kdt.query(np.array([y]), k=2, return_distance=True)
# `points` holds indices into X, `distances` the matching distances;
# loop variables renamed so they no longer shadow X/y from above.
for idx, dist in zip(points, distances):
    print(idx, dist)  # e.g. points = [1 0], distances = [1. 1.]
(5) sklearn.cross_validation 交叉验证
# Shuffle X and y together and split them into a training and a test set;
# test_size is the fraction of samples held out for testing.
# BUG FIX: `sklearn.cross_validation` was removed in scikit-learn 0.20;
# train_test_split now lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# Split 10 sample indices into 3 folds of (train, test) index arrays.
# BUG FIX: `sklearn.cross_validation.KFold(10, n_folds=3)` was removed in
# scikit-learn 0.20; the class now lives in `sklearn.model_selection`,
# takes `n_splits`, and yields indices from its .split() method.
from sklearn.model_selection import KFold
kf = KFold(n_splits=3)
for train_index, test_index in kf.split(list(range(10))):
    print(train_index, test_index)
'''Output: [4 5 6 7 8 9] [0 1 2 3]
[0 1 2 3 7 8 9] [4 5 6]
[0 1 2 3 4 5 6] [7 8 9] '''
# StratifiedKFold is usually preferred over KFold: each fold preserves the
# per-class proportions of the labels, so it needs the actual label array,
# not just the dataset length.
# BUG FIX: the old `StratifiedKFold(labels, n_folds=3)` API was removed in
# scikit-learn 0.20; labels are now passed to .split().
from sklearn.model_selection import StratifiedKFold
y = [0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
X = [[0]] * len(y)  # placeholder features; stratification only looks at y
skf = StratifiedKFold(n_splits=3)
for train_index, test_index in skf.split(X, y):
    print(train_index, test_index)
'''Example output (each fold keeps the 40/60 class ratio):
[2 3 6 7 8 9] [0 1 4 5]
[0 1 3 4 5 8 9] [2 6 7]
[0 1 2 4 5 6 7] [3 8 9] '''
(6) sklearn.metrics 模型评估
# classification_report — text summary of precision / recall / f1 per class.
#   classification_report(y_true, y_pred, labels=None, target_names=None,
#                         sample_weight=None, digits=2)
#   y_true: ground-truth labels; y_pred: predicted labels;
#   labels: label values to include (and their order) in the report;
#   target_names: display names matching `labels`;
#   digits: number of digits used to format the reported metrics.
# FIX: Python 2 `print` statement replaced with the cross-version print().
from sklearn.metrics import classification_report
y_true = [0, 1, 0, 1, 1, 1, 0]
y_pred = [0, 1, 0, 1, 1, 1, 1]
labels = [0, 1]
target_names = ["label1", "label2"]
print(classification_report(y_true, y_pred, labels=labels, target_names=target_names))
# confusion_matrix(y_true, y_pred) — rows are true classes, columns are
# predicted classes; entry [i, j] counts samples of class i predicted as j.
# FIX: Python 2 `print` statement replaced with the cross-version print().
from sklearn.metrics import confusion_matrix
y_true = [0, 1, 0, 1, 1, 1, 0]
y_pred = [0, 1, 0, 1, 1, 1, 1]
print(confusion_matrix(y_true, y_pred))
'''Output: [[2 1]
[0 4]] '''
关于 sklearn 中其他的内容以后会一直补充 。。。