下面完整代码在github仓库:传送门
一、分类报告、混淆矩阵
import numpy as np
from sklearn import linear_model, svm, neighbors, datasets, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.model_selection import cross_val_score
# 1. Load the iris dataset: feature matrix and class labels.
iris = datasets.load_iris()
x = iris.data
y = iris.target

# 2. Hold out 30% of the samples as a test set (fixed seed for repeatability).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0
)

# 3. Standardize the features. The scaler is fitted on the training split only
#    and then applied to both splits, so no test statistics leak into training.
scaler = preprocessing.StandardScaler()
scaler.fit(x_train)
x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)

# 4./5. Build an RBF-kernel support vector classifier and fit it.
clf = svm.SVC(kernel='rbf')
clf.fit(x_train, y_train)

# Predict the labels of the held-out samples.
y_pred = clf.predict(x_test)

# Evaluation: overall accuracy first.
print(accuracy_score(y_test, y_pred))

# Macro-averaged F1, where F1 = 2 * (P * R) / (P + R) per class.
print(f1_score(y_test, y_pred, average='macro'))

# Per-class precision / recall / F1 / support table.
print(classification_report(y_test, y_pred))

# Confusion matrix (rows are true classes). Per the original author's run:
# row 1 has all 16 samples correct, row 2 has 17 correct and 1 wrong,
# row 3 has all 11 correct.
print(confusion_matrix(y_test, y_pred))
"""
精确率:precision,正确预测为正的,占全部预测为正的比例,TP / (TP+FP)
召回率:recall,正确预测为正的,占全部实际为正的比例,TP / (TP+FN)
F1-score:精确率和召回率的调和平均数,2 * precision*recall / (precision+recall)
类别数量:每类数据标签的数量。
微平均值:micro average,所有数据结果的平均值
宏平均值:macro average,所有标签结果的平均值
加权平均值:weighted average,所有标签结果的加权平均值
(P1 * support1 + P2 * support2 + P3 * support3) / (support1+support2+support3)
"""
二:机器学习的标准化
import numpy as np
import matplotlib.pyplot as plt
def normalization1(x):
    """Min-max scale the values of ``x`` into the range [0, 1].

    Each element ``v`` is mapped to ``(v - min) / (max - min)``.

    Args:
        x: Iterable of numbers.

    Returns:
        A list with one scaled value per input element, in input order.
        An empty input yields an empty list.

    Raises:
        ZeroDivisionError: For plain Python numbers when every value is
            identical (``max == min``), since the range is then zero.
    """
    values = list(x)
    if not values:
        # min()/max() reject empty sequences; there is nothing to scale.
        return []
    # min and max are loop-invariant: compute them once instead of once per
    # element, which made the scaling quadratic in the input length.
    lowest = min(values)
    span = float(max(values) - lowest)
    return [(float(v) - lowest) / span for v in values]
def normalization2(x):
    """Mean-normalize the values of ``x``.

    Each element ``v`` is mapped to ``(v - mean) / (max - min)``, so the
    result is centered on zero and spans a total width of one.

    Args:
        x: Iterable of numbers.

    Returns:
        A list with one normalized value per input element, in input order.
        An empty input yields an empty list.
    """
    values = list(x)
    if not values:
        # Avoid calling np.mean / min / max on an empty sequence.
        return []
    # Mean and range do not depend on the element being scaled, so compute
    # them once rather than inside the comprehension.
    center = np.mean(values)
    span = max(values) - min(values)
    return [(float(v) - center) / span for v in values]
def normalization3(x):
    """Standardize the values of ``x`` to zero mean and unit standard deviation.

    Each element ``v`` is mapped to ``(v - mean) / (std + 1e-5)``, where
    ``std`` is the population standard deviation. The small constant keeps
    the division defined when all values are identical.

    Args:
        x: Iterable of numbers.

    Returns:
        A list with one standardized value per input element, in input order.
        An empty input yields an empty list.
    """
    values = np.asarray(list(x), dtype=float)
    if values.size == 0:
        return []
    x_mean = values.mean()
    # Population variance: mean of squared deviations from the mean.
    s2 = np.mean((values - x_mean) ** 2)
    std = np.sqrt(s2)
    # Divide by the standard deviation, not the variance: dividing by the
    # variance does not give unit spread, which is what standardization means.
    return [(v - x_mean) / (std + 0.00001) for v in values]
def normalization4(x):
    """Scale non-negative values of ``x`` into the range [-1, 1].

    Each element ``v`` is mapped to ``((v / max) - 0.5) / 0.5``. The [-1, 1]
    range only holds when every value is non-negative; negative inputs map
    below -1.

    Args:
        x: Iterable of numbers.

    Returns:
        A list with one scaled value per input element, in input order.
        An empty input yields an empty list.
    """
    values = list(x)
    if not values:
        # np.max rejects empty input; there is nothing to scale.
        return []
    # The maximum is loop-invariant: compute it once, not once per element.
    peak = np.max(values)
    return [(float(v) / peak - 0.5) / 0.5 for v in values]
# Sample data: values clustered around 10 plus one low and one high outlier.
l1 = [
    -10,
    5, 5,
    6, 6, 6,
    7, 7, 7, 7,
    8, 8, 8, 8, 8,
    9, 9, 9, 9, 9, 9,
    10, 10, 10, 10, 10, 10, 10,
    11, 11, 11, 11, 11, 11,
    12, 12, 12, 12, 12,
    13, 13, 13, 13,
    14, 14, 14,
    15, 15,
    30,
]
print(np.mean(l1))  # 10

# For each element, the number of times its value occurs in l1. This list is
# used as the y-axis when plotting, so it has one entry per element of l1.
cs = [l1.count(value) for value in l1]
for c in cs:
    print(c)  # prints 1 2 2 3 3 3 ...

# Apply each normalization variant to the same data.
n1 = normalization1(l1)
print(n1)
n2 = normalization2(l1)
n3 = normalization3(l1)
n4 = normalization4(l1)

# Only the n3 and n4 curves are plotted; l1, n1 and n2 can be plotted
# against cs the same way for comparison.
plt.plot(n3, cs)
plt.plot(n4, cs)
plt.show()
三:支持向量机回归
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR, SVC
import sklearn
# Seeded generator so the synthetic data is reproducible.
rng = np.random.RandomState(0)

# 100 sample points in [0, 5), kept as a 2-D column of shape (100, 1).
X = 5 * rng.rand(100, 1)
# Targets are sin(X), flattened to 1-D.
y = np.sin(X).ravel()

# Add noise to every 5th target: 3 * (0.5 - u) with u in [0, 1) lies in
# (-1.5, 1.5], and one noise value is drawn per selected target.
noise = 3 * (0.5 - rng.rand(X.shape[0] // 5))
y[::5] += noise

# Author's notes on the hyper-parameters:
# - As C grows toward infinity, errors on the training data are no longer
#   tolerated.
# - As gamma grows, the model can fit the training data well while doing
#   poorly on test data, i.e. it tends to overfit.
svr = SVR(kernel='rbf', C=10, gamma=0.1)
svr.fit(X, y)

# Evaluate the fitted curve on an evenly spaced grid; predict() needs a
# 2-D input, so the grid is reshaped into a single column.
X_plot = np.linspace(0, 5, 100)
y_svr = svr.predict(X_plot.reshape(-1, 1))

# Noisy samples as points, fitted regression curve in red.
plt.scatter(X, y)
plt.plot(X_plot, y_svr, color="red")
plt.show()
四:计算方差
import numpy as np
# Six 2-D points [[2,3],[5,4],[9,6],[4,7],[8,1],[7,2]], stored as separate
# x and y coordinate arrays. Print the population variance of each axis.
x1 = np.array([2, 5, 9, 4, 8, 7])
y1 = np.array([3, 4, 6, 7, 1, 2])
print(np.var(x1), np.var(y1))

# Subset of three of the points: (2,3), (5,4), (4,7).
x2 = np.array([2, 5, 4])
y2 = np.array([3, 4, 7])
print(np.var(x2), np.var(y2))

# Subset of two of the points: (8,1), (9,6).
x3 = np.array([8, 9])
y3 = np.array([1, 6])
print(np.var(x3), np.var(y3))