# 同时也是 第3章 线性模型 3.4答案
# 3.4 选择两个UCI数据集,比较10折交叉验证法和留一法所估计出的对率回归的错误率
# (这里只选择了UCI的iris数据集:https://archive.ics.uci.edu/ml/index.php)
# 理论知识:笔记(二)机器学习(周志华)第2章 模型选择和评估——交叉验证
# 笔记(三)机器学习(周志华)第3章 线性模型——对率回归
import pandas as pd
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict, cross_val_score, LeaveOneOut, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score
from numpy import *
# Load the data
def loadDataSet(path='iris.data'):
    """Load the UCI iris CSV and return features plus integer class labels.

    The file is read without a header row. The first four columns are taken
    as the features and the last column as the species name.

    Args:
        path: Location of the CSV file. Defaults to ``'iris.data'`` (relative
            to the current working directory), the name that used to be
            hard-coded here.

    Returns:
        A ``(xArr, labels)`` tuple of NumPy arrays. ``xArr`` holds the first
        four columns as floats; ``labels`` holds the integer codes 0 / 1 / 2
        for Iris-setosa / Iris-versicolor / Iris-virginica.

    Raises:
        ValueError: If the last column contains a species name other than
            the three listed above.
    """
    species_codes = {
        'Iris-setosa': 0,
        'Iris-versicolor': 1,
        'Iris-virginica': 2,
    }

    raw_data = pd.read_csv(path, header=None)

    # ``.values`` on the mixed-type frame is an object array; cast the
    # feature slice so callers receive a proper numeric matrix.
    xArr = raw_data.values[:, 0:4].astype(float)

    # Translate species names to codes. Names missing from the mapping come
    # back as NaN, which is reported here instead of leaking stray strings
    # into the label array.
    species = raw_data.iloc[:, -1]
    codes = species.map(species_codes)
    missing = codes.isnull()
    if missing.any():
        raise ValueError(
            'Unknown iris label(s): {}'.format(list(species[missing].unique()))
        )

    # Integer dtype, so scikit-learn does not reject the targets with
    # "Unknown label type" as it did for the object-dtype labels.
    labels = codes.values.astype(int)
    return xArr, labels
# Model to train: a single LogisticRegression instance created with default
# arguments. The evaluation functions in this file (crossValidation, kFold,
# stratifiedKFold, LOO) all reference this module-level object.
logistModel = LogisticRegression()
def crossValidation(xArr, labels):
    """Return the mean 10-fold cross-validation score of ``logistModel``.

    Args:
        xArr: Feature matrix passed straight to ``cross_val_score``.
        labels: Target vector passed straight to ``cross_val_score``.

    Returns:
        The mean of the per-fold scores (default scoring; the calling script
        treats the value as accuracy).
    """
    # One score per fold, i.e. an array of shape (10,). Per the original
    # author's note, passing a plain integer ``cv`` provides no shuffling.
    fold_scores = cross_val_score(logistModel, xArr, labels, cv=10)

    # Alternative the original author regarded as equivalent: collect the
    # out-of-fold predictions with cross_val_predict(logistModel, xArr,
    # labels, cv=10) and score them once with accuracy_score.
    return fold_scores.mean()
# KFold + cross_val_score: this combination supports shuffling
def kFold(x, y):
    """Return the mean macro-precision over a shuffled 10-fold split.

    Args:
        x: Feature matrix passed to ``cross_val_score``.
        y: Target vector passed to ``cross_val_score``.

    Returns:
        The mean of the ten per-fold ``precision_macro`` scores obtained
        with the module-level ``logistModel``.
    """
    # random_state=None: per the original author's note, the result is not
    # unique from one run to the next.
    splitter = KFold(n_splits=10, shuffle=True, random_state=None)
    fold_scores = cross_val_score(
        logistModel, x, y, cv=splitter, scoring='precision_macro'
    )
    return fold_scores.mean()
# Stratified k-fold cross-validation split
def stratifiedKFold(x, y):
    """Return the mean score over a shuffled, stratified 10-fold split.

    Args:
        x: Feature matrix passed to ``cross_val_score``.
        y: Target vector passed to ``cross_val_score``.

    Returns:
        The mean of the ten per-fold scores (default scoring) obtained with
        the module-level ``logistModel``.
    """
    # The splitter is seeded with random_state=1.
    splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
    fold_scores = cross_val_score(logistModel, x, y, cv=splitter)
    return fold_scores.mean()
# Leave-one-out
def LOO(x, y):
    """Estimate accuracy with leave-one-out cross-validation.

    For every split produced by ``LeaveOneOut().split(x)`` the module-level
    ``logistModel`` is fitted in place on the training rows and asked to
    predict the held-out row.

    Args:
        x: Feature array; indexed with the train/test index arrays.
        y: Target array; indexed with the same index arrays.

    Returns:
        Number of correctly predicted held-out rows divided by
        ``x.shape[0]``.
    """
    splitter = LeaveOneOut()
    hits = 0
    # Per the original author's note, each split pairs all-but-one indices
    # with the single held-out index: [1..149]==[0], [0 2..149]==[1], ...
    for train_idx, test_idx in splitter.split(x):
        logistModel.fit(x[train_idx], y[train_idx])
        predicted = logistModel.predict(x[test_idx])
        matches = predicted == y[test_idx]
        if not matches:
            continue
        hits += 1
    return hits / x.shape[0]
if __name__ == '__main__':
    xArr, labels = loadDataSet()

    # The original author recorded that fitting on the labels as loaded
    # failed with "ValueError: Unknown label type: 'unknown'", so they are
    # cast to integers before any model sees them.
    labels = labels.astype('int')

    # 10-fold cross-validation with an integer ``cv`` (accuracy).
    cv_score = crossValidation(xArr, labels)
    print(cv_score)

    # Leave-one-out (accuracy).
    loo_score = LOO(xArr, labels)
    print(loo_score)

    # Shuffled KFold (precision_macro).
    kf_score = kFold(xArr, labels)
    print(kf_score)

    # Shuffled StratifiedKFold (default scoring).
    skf_score = stratifiedKFold(xArr, labels)
    print(skf_score)