scikit-learn学习网站
https://scikit-learn.org/stable/
基本流程:
1、基本模型流程
from sklearn import datasets
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pickle
import numpy as np

# Load the bundled toy datasets.
iris = datasets.load_iris()
digits = datasets.load_digits()
# print(iris.data)
# print(iris.data.shape)    # (150, 4): 150 samples, 4 features each
# print(iris.target_names)  # ['setosa' 'versicolor' 'virginica']: the 3 label names
# print(iris.target)        # class of each of the 150 samples, encoded as 0/1/2
print(digits.data.shape)  # (1797, 64)

# Manual train/test split: hold out the last n_test samples.
n_test = 100  # number of test samples
train_X = digits.data[:-n_test, :]  # all rows except the last n_test, all columns
train_y = digits.target[:-n_test]   # matching labels
test_X = digits.data[-n_test:, :]
y_true = digits.target[-n_test:]

# SVM model with hand-picked hyper-parameters.
svm_model = svm.SVC(gamma=0.001, C=100.)
# svm_model = svm.SVC(gamma=100., C=1.)  # deliberately bad hyper-parameters
svm_model.fit(train_X, train_y)

# Logistic-regression baseline.
# max_iter raised from the default 100 so the lbfgs solver actually
# converges on the 64-dimensional digits data (avoids ConvergenceWarning).
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(train_X, train_y)

# Evaluate both models on the held-out test set.
y_pred_svm = svm_model.predict(test_X)
y_pred_lr = lr_model.predict(test_X)
print('SVM结果:', accuracy_score(y_true, y_pred_svm))  # ~0.98
print('LR结果:', accuracy_score(y_true, y_pred_lr))  # ~0.97

# Persist the trained SVM with pickle ('wb' = write binary).
with open('svm_model.pkl', 'wb') as f:
    pickle.dump(svm_model, f)

# Reload the model and predict on 5 randomly chosen samples.
with open('svm_model.pkl', 'rb') as f:
    model = pickle.load(f)
# Bug fix: np.random.randint's upper bound is exclusive, so use
# len(digits.data) (= 1797) — with the previous hard-coded 1796 the
# last sample could never be drawn.
random_samples_index = np.random.randint(0, len(digits.data), 5)
random_samples = digits.data[random_samples_index, :]
random_targets = digits.target[random_samples_index]
random_predict = model.predict(random_samples)
print(random_predict)  # e.g. [6 2 5 6 4]
print(random_targets)  # e.g. [6 2 5 6 4]
2、特征归一化
import numpy as np
from sklearn.model_selection import train_test_split

# Build a toy dataset: a 10x4 matrix, i.e. 10 samples with 4 integer
# features each, values drawn from [0, 100).
X = np.random.randint(0, 100, (10, 4))
# 10 labels drawn from {0, 1, 2} (the upper bound 3 is exclusive), sorted.
y = np.random.randint(0, 3, 10)
y.sort()
print('样本:')
print(X)
print('标签:', y)

# Split into train/test; random_state pins the shuffle so every run
# produces the same split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1 / 3., random_state=7)
print('训练集:')
print(X_train)
print(y_train)
print('测试集:')
print(X_test)
print(y_test)

# Feature scaling demo: three columns on wildly different scales
# (~1e3, ~1e1, ~1e5), each with 5 samples.
from sklearn import preprocessing

col_a = np.random.randint(0, 1000, 5).reshape(5, 1)
col_b = np.random.randint(0, 10, 5).reshape(5, 1)
col_c = np.random.randint(0, 100000, 5).reshape(5, 1)
X = np.concatenate([col_a, col_b, col_c], axis=1)
print(X)
# The magnitudes differ too much; scale() standardizes each column
# (zero mean, unit variance) so they share a common scale.
print(preprocessing.scale(X))
生成分类数据进行验证scale的必要性
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Synthesize a 2-feature binary classification set, deliberately
# multiplied by scale=100 so the raw features have large magnitude.
X, y = make_classification(
    n_samples=300,
    n_features=2,
    n_redundant=0,
    n_informative=2,
    random_state=25,
    n_clusters_per_class=1,
    scale=100,
)
plt.scatter(X[:, 0], X[:, 1], c=y)
plt.show()

from sklearn import svm, preprocessing

# Keep the next line commented out to see how the SVM performs WITHOUT
# feature normalization; uncomment it to standardize the features.
# X = preprocessing.scale(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1 / 3., random_state=7)
svm_classifier = svm.SVC()
svm_classifier.fit(X_train, y_train)
print(svm_classifier.score(X_test, y_test))
不归一化是0.97,归一化是0.98
3、交叉验证
from sklearn import datasets
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier  # only one hyper-parameter to pick: k
import matplotlib.pyplot as plt

iris = datasets.load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1 / 3., random_state=10)

# Sweep k from 1 to 30 and record the cross-validated accuracy for each.
k_range = range(1, 31)
cv_scores = []
for k in k_range:
    knn = KNeighborsClassifier(k)
    # cv=10: ten different train/validation splits; scores are averaged below.
    scores = cross_val_score(knn, X_train, y_train, cv=10, scoring='accuracy')  # for classification
    # scores = cross_val_score(knn, X_train, y_train, cv=10, scoring='neg_mean_squared_error')  # for regression
    cv_scores.append(scores.mean())

plt.plot(k_range, cv_scores)
plt.xlabel('K')
plt.ylabel('Accuracy')
plt.show()

# Refit with the best k read off the curve and evaluate on the test set.
best_knn = KNeighborsClassifier(n_neighbors=5)
best_knn.fit(X_train, y_train)
print(best_knn.score(X_test, y_test))
print(best_knn.predict(X_test))
# score is 0.96 at k=5 and 0.94 at k=27
4、过拟合与欠拟合---超参数的选择
sklearn.svm.SVC
一般需要选择C,kernel,gamma三个比较重要的超参数
①学习曲线learning_curve
from sklearn.model_selection import learning_curve
from sklearn.datasets import load_digits
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import numpy as np

# Load the digits dataset.
digits = load_digits()
X = digits.data
y = digits.target

# learning_curve: for each of the 5 training-set fractions it runs
# 10-fold cross validation (cv=10 splits the data into 10 parts, one
# held out for validation each round) and collects train/validation
# accuracies.
train_sizes, train_scores, val_scores = learning_curve(
    SVC(gamma=0.001), X, y, cv=10, scoring='accuracy',
    train_sizes=[0.1, 0.25, 0.5, 0.75, 1],
)

# Average the scores over the 10 folds.
train_scores_mean = np.mean(train_scores, axis=1)
val_scores_mean = np.mean(val_scores, axis=1)

# Plot the learning curve.
plt.plot(train_sizes, train_scores_mean, 'o-', color='r', label='training')
plt.plot(train_sizes, val_scores_mean, '*-', color='g', label='cross validation')
plt.xlabel('training sample size')
plt.ylabel('accuracy')
plt.legend(loc='best')
plt.show()
gamma=0.001时,在测试集和训练集都越来越好,是刚刚好的情况
gamma=0.1时,即使训练样本少在训练集也一直是百分之百,但在测试集很差,过拟合了
②验证集validation_curve
from sklearn.model_selection import validation_curve
from sklearn.datasets import load_digits
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import numpy as np

# Load the digits dataset.
digits = load_digits()
X = digits.data
y = digits.target
print(X.shape)
print(y)

# Sweep gamma over 10 log-spaced values (a log scale plots more readably).
# param_range = np.arange(1, 6) / 500.
param_range = np.logspace(-6.5, -2, 10)
print(param_range)

# validation_curve: param_name selects which SVC parameter to vary;
# each of the 10 gamma values is evaluated with 5-fold cross validation.
train_scores, val_scores = validation_curve(
    SVC(), X, y, param_name='gamma', param_range=param_range,
    cv=5, scoring='accuracy')

# Average the scores over the 5 folds.
train_scores_mean = np.mean(train_scores, axis=1)
val_scores_mean = np.mean(val_scores, axis=1)

# Plot the validation curve.
plt.plot(param_range, train_scores_mean, 'o-', color='r', label='training')
plt.plot(param_range, val_scores_mean, '*-', color='g', label='cross validation')
plt.xlabel('gamma')
plt.ylabel('accuracy')
plt.legend(loc='best')
plt.show()
gamma取0.001之后基本就是过拟合了
③选择C,kernel,gamma三个超参数调整
使用GridSearch网格搜索
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets, svm
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

iris = datasets.load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2, random_state=0)

# Search space: two kernel families, 12 combinations in total
# (4 for linear + 4 C values x 2 gamma values for rbf).
# The linear kernel takes no gamma parameter.
param_grid = [
    {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
    {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},  # Gaussian kernel
]
svm_model = svm.SVC()

# GridSearchCV tries every combination, scoring each with 5-fold CV.
clf = GridSearchCV(svm_model, param_grid, cv=5)
clf.fit(X_train, y_train)

# Best fitted model and the hyper-parameters that produced it.
best_model = clf.best_estimator_
print(clf.best_params_)  # e.g. {'C': 10, 'kernel': 'linear'}

# Evaluate the best model on the held-out test set.
y_pred = best_model.predict(X_test)
print('accuracy', accuracy_score(y_test, y_pred))  # accuracy 1.0
5、特征选择
(特征降维–PCA算协方差矩阵,然后得到特征值特征向量,特征值从大到小排序进行选择,对应的特征向量就是选择的新的向量。并不是原有特征中的任何一个,而是进行了特征映射)
(特征选择–选出来的特征就是原有特征中的一个或几个)
①去除方差小的特征
# 1. Remove low-variance features
from sklearn.feature_selection import VarianceThreshold

# 6 samples, each a 3-dimensional binary feature vector.
X = [[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1]]
# Drop boolean features that take the same value in more than 80% of
# samples; for a Bernoulli feature the variance is p * (1 - p).
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
print(sel.fit_transform(X))
三维向量选取了两维
输出:
[[0 1]
[1 0]
[0 0]
[1 1]
[1 0]
[1 1]]
②基于单变量统计特征选择
# 2. Univariate statistical feature selection
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

iris = load_iris()
X, y = iris.data, iris.target
print('原始特征:')
print(X.shape)
print(X[:5, :])

# Keep the k=2 features with the highest chi-squared score against y
# (reduces the 4 original features to 2).
X_new = SelectKBest(chi2, k=2).fit_transform(X, y)
print('选取的特征:')
print(X_new.shape)
print(X_new[:5, :])
输出:
原始特征:
(150, 4)
[[5.1 3.5 1.4 0.2]
[4.9 3. 1.4 0.2]
[4.7 3.2 1.3 0.2]
[4.6 3.1 1.5 0.2]
[5. 3.6 1.4 0.2]]
选取的特征:
(150, 2)
[[1.4 0.2]
[1.4 0.2]
[1.3 0.2]
[1.5 0.2]
[1.4 0.2]]
③ 基于模型的特征选择
# 3. Model-based feature selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel

iris = load_iris()
X, y = iris.data, iris.target
print('原始特征:')
print(X.shape)
print(X[:5, :])

# Fit a random forest; feature_importances_ scores each feature.
clf = RandomForestClassifier()
clf = clf.fit(X, y)
print('特征得分:')
print(clf.feature_importances_)

# prefit=True reuses the already-fitted clf. Features whose importance
# is at or above the threshold are kept; the default threshold is the
# mean of the importances.
model = SelectFromModel(clf, prefit=True)
X_new = model.transform(X)
print('选取的特征:')
print(X_new.shape)
print(X_new[:5, :])
输出:
原始特征:
(150, 4)
[[5.1 3.5 1.4 0.2]
[4.9 3. 1.4 0.2]
[4.7 3.2 1.3 0.2]
[4.6 3.1 1.5 0.2]
[5. 3.6 1.4 0.2]]
特征得分:
[0.0985073 0.03100764 0.43976362 0.43072144]
选取的特征:
(150, 2)
[[1.4 0.2]
[1.4 0.2]
[1.3 0.2]
[1.5 0.2]
[1.4 0.2]]
6、评价指标补充
多种指标简单介绍
详细介绍
①曲线下面积(AUC)
AUC就是特征曲线ROC下的面积,用于二分类,当输出预测值不是简单的标签,而是一个概率值
②对数损失logloss