# Data file download:
# Link: https://pan.baidu.com/s/1VdjCSw9MfKZ1WAcaiDyi8w  Extraction code: wtck
"""
学习曲线 分别使用 不同大小的训练数据集
检验正确率
"""
import numpy as np
import sklearn.preprocessing as sp
import sklearn.ensemble as se
import sklearn.model_selection as ms
import matplotlib.pyplot as mp
# Load the raw samples: comma-separated fields, every field kept as text.
# A bare relative name is used instead of r".\car.txt" because the backslash
# is a path separator only on Windows; on other platforms the old spelling
# does not resolve to car.txt in the working directory.
lines = np.loadtxt("car.txt", delimiter=",", dtype="str")
print(lines.shape)

# Label-encode the data column by column. One encoder is fitted per column
# and kept (in column order) so the same mapping can be reused later to
# encode new samples and to decode predictions.
encoders = []  # fitted LabelEncoder for each column, in column order
encoded_columns = []
for row in lines.T:
    encoder = sp.LabelEncoder()
    encoded_columns.append(encoder.fit_transform(row))
    encoders.append(encoder)

# Every column except the last is an input feature; the last is the target.
*feature_columns, label_column = encoded_columns
train_x = np.array(feature_columns).T  # transpose back to one row per sample
train_y = np.array(label_column)
print(train_x.shape, train_y.shape)
print(train_x[0], train_y[0])
# Random forest classifier with a fixed random_state of 7.
forest_params = {"max_depth": 9, "n_estimators": 140, "random_state": 7}
model = se.RandomForestClassifier(**forest_params)

# Training-set fractions 0.1, 0.2, ..., 0.9 in steps of 0.1
# (np.arange excludes the 1.0 stop value).
train_sizes = np.arange(0.1, 1.0, 0.1)

# Score the model at each training-set fraction with 6-fold cross-validation.
curve = ms.learning_curve(
    model,
    train_x,
    train_y,
    train_sizes=train_sizes,
    cv=6,
)
_, train_scores, test_scores = curve

# Average the validation scores along axis 1 to get one value per fraction.
test_mean = np.mean(test_scores, axis=1)
# Original author's observation: the 0.9 fraction gave the best result.

# Draw the learning curve: mean validation score against training fraction.
mp.grid(linestyle=":")
mp.plot(
    train_sizes,
    test_mean,
    "o-",
    color="dodgerblue",
    label="learning curve",
)
mp.legend()
mp.show()
# Fit the classifier on the complete encoded training set.
model.fit(train_x, train_y)

# Hand-written samples used to try out the fitted model. Each row lists the
# feature values followed by the expected class label in the last position.
data = np.array([
    ['high', 'med', '5more', '4', 'big', 'low', 'unacc'],
    ['high', 'high', '4', '4', 'med', 'med', 'acc'],
    ['low', 'low', '2', '4', 'small', 'high', 'good'],
    ['low', 'med', '3', '4', 'med', 'high', 'vgood'],
])

# Encode every column with the encoder that was fitted on the matching
# training column, so the integer codes agree with what the model was
# trained on.
encoded_columns = [
    encoders[index].transform(row) for index, row in enumerate(data.T)
]

# Split the encoded columns: all but the last are inputs, the last is the label.
*feature_columns, label_column = encoded_columns
test_x = np.array(feature_columns).T  # one row per sample again
test_y = np.array(label_column)
print(test_x.shape, test_y.shape)
print(test_x[0], test_y[0])

# Predict, then show expected vs. predicted labels: first as integer codes,
# then decoded back to the original class names via the label-column encoder.
prd_test_y = model.predict(test_x)
print(test_y)
print(prd_test_y)
label_encoder = encoders[-1]
print(label_encoder.inverse_transform(test_y))
print(label_encoder.inverse_transform(prd_test_y))