实验四 机器学习算法建模与求解
实验目的:掌握使用实用软件通过各类基础的机器学习算法解决实际数据统计分析任务的能力,熟悉线性回归、SVM、kmeans、PCA等算法的调用。
实验内容:
1、对于下表中的数据,对1990年-2005年内的数据建立人口自然增长率对于国民总收入、CPI增长率和人均GDP的三元线性回归模型。
2、下载UCI中wine数据集:http://archive.ics.uci.edu/ml/datasets/Wine。所下载数据可以用txt打开,其中每一行数据为一种Wine的记录,每条记录包含14个维度,其中第一维为该Wine类别,后面13维为具体的Wine属性。请基于所有的178个Wine样本对Wine的13个维度进行PCA降维分析,将贡献率之和大于90%的成分提取,并将原13维属性数据映射为新数据。
3、
(1)随机生成均值、方差各不相同,且相互之间有少量交叉的3个类,每类30个样本,用不同的颜色进行展示。
(2)通过kmeans聚类分析,将所有的数据分成3类、4类、5类,每一类用不同颜色展示。
共形成4张图。
4、随机生成完全不交叉的2个类,每个类包含30个样本,用SVM进行分类和返回所有支撑向量,并以合适方式进行Figure展示。
实验代码及结果:
1、
import pandas as pd
from sklearn.linear_model import LinearRegression
import xlrd
# Load the worksheet: column 0 holds the target (population natural growth
# rate); columns 1-3 hold the three explanatory variables.
# Raw string: the original non-raw path relied on '\数', '\P', '\e', ... being
# invalid escape sequences that Python happens to keep literally (and warns on).
data = xlrd.open_workbook(r'E:\数据分析与统计基础\实验\Project\exp4\dataset.xls')
table = data.sheets()[0]
# Assemble the explanatory variables into a DataFrame:
# a = gross national income, b = CPI growth rate, c = per-capita GDP
# (column order per the assignment statement — confirm against the sheet).
data_x = pd.DataFrame({
    "a": table.col_values(1, start_rowx=0, end_rowx=None),
    "b": table.col_values(2, start_rowx=0, end_rowx=None),
    "c": table.col_values(3, start_rowx=0, end_rowx=None),
})
print(data_x)
data_y = pd.Series(table.col_values(0, start_rowx=0, end_rowx=None))
print(data_y)

if __name__ == '__main__':
    # Ordinary least squares fit of y = b0 + b1*a + b2*b + b3*c.
    reg = LinearRegression()
    reg.fit(data_x, data_y)
    print("偏置:%.8f" % reg.intercept_)
    print("权重向量:")
    print(reg.coef_)
    print("拟合优度:%.8f" % reg.score(data_x, data_y))
2、
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
if __name__ == '__main__':
    # wine.txt: one sample per line, 14 comma-separated values — the first is
    # the class label (1-3), the remaining 13 are the wine attributes.
    # (Original line had stray prose fused after the literal — a syntax error —
    # and a non-raw Windows path; both fixed here.)
    filename = r'E:\数据分析与统计基础\实验\Project\wine.txt'
    pos = []
    with open(filename, 'r') as file_to_read:
        while True:
            line = file_to_read.readline()  # read one record
            if not line:
                break
            pos.append([float(v) for v in line.split(sep=",")])
    pos = np.array(pos)
    print("训练数据为:")
    print(pos)
    data = np.array(pos)
    X = data[:, 1:]  # the 13 attribute columns
    y = data[:, 0]   # class label column
    # Standardize each attribute to zero mean / unit variance.
    X_std = StandardScaler().fit_transform(X)
    # Covariance matrix of the standardized attributes.
    cov_mat = np.cov(X_std.T)
    # The covariance matrix is symmetric, so eigh is the right decomposition
    # (real eigenvalues, ascending order). Sort descending so the cumulative
    # contribution below ranks components by explained variance —
    # np.linalg.eig guarantees no ordering, which made the 90% count below
    # unreliable in the original.
    eigen_vals = np.linalg.eigh(cov_mat)[0][::-1]
    tot = sum(eigen_vals)
    var_exp = [i / tot for i in eigen_vals]  # per-component contribution ratio
    print("各属性贡献率为:")
    print(var_exp)
    cum_var_exp = np.cumsum(var_exp)  # cumulative contribution
    # Plot individual vs. cumulative explained variance.
    plt.bar(range(len(eigen_vals)), var_exp, width=1.0, bottom=0.0, alpha=0.5,
            label='individual explained variance')
    plt.step(range(len(eigen_vals)), cum_var_exp, where='post',
             label='cumulative explained variance')
    plt.ylabel('Explained variance ratio')
    plt.xlabel('Principal components')
    plt.legend(loc='best')
    plt.show()
    # Observed on this dataset: 8 components reach > 90% cumulative
    # contribution. searchsorted on the (ascending) cumulative sums returns
    # the first index with cum >= 0.9; +1 converts index to a count —
    # equivalent to the original "count entries < 0.9, then add 1" loop.
    n = int(np.searchsorted(cum_var_exp, 0.9) + 1)
    print(n, "个属性贡献率之和大于90%")
    pca = PCA(n)
    pca.fit(X_std)
    print(pca.explained_variance_ratio_)
    low_d = pca.transform(X_std)  # map the 13-D data onto the n components
    # Scatter the first two principal components, one color per wine class.
    plt.figure()
    for c, i, name in zip("rgb", [1, 2, 3], [1, 2, 3]):
        plt.scatter(low_d[y == i, 0], low_d[y == i, 1], c=c, label=name)
    plt.xlabel('Dimension1')
    plt.ylabel('Dimension2')
    plt.title("wine-standard-PCA")
    plt.legend()
    plt.show()
    # Sanity check: logistic regression on the reduced data (40% held out).
    X_train, X_test, y_train, y_test = train_test_split(low_d, y, test_size=0.4,
                                                        random_state=0)
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    print("降维后进行逻辑回归分析测试集上的精确度:%.4f" % lr.score(X_test, y_test))
3、
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import argparse
# CLI: --n selects how many clusters K-Means should produce (3, 4 or 5).
parser = argparse.ArgumentParser(description='K-Means Test')
parser.add_argument('--n', default=3, type=int,
help='number of classes to classify')
# NOTE(review): the 'augment' default is never read in this script — confirm.
parser.set_defaults(augment=True)
# Parsed at import time; `args` is read globally by show() and main().
args = parser.parse_args()
print(args)
def show(label_pred, X, centroids):
    """Scatter-plot each predicted cluster in its own color plus the centroids.

    label_pred: cluster index per sample; X: list of [x, y] points;
    centroids: array of cluster centers. Plots args.n clusters (3, 4 or 5).
    Replaces five hand-unrolled per-cluster lists with one grouping loop and
    guards against an empty cluster, which previously made
    np.array([])[:, 0] raise IndexError.
    """
    colors = ['blue', 'red', 'yellow', 'black', 'deeppink']
    # Group the samples by predicted label (up to 5 clusters supported).
    groups = [[] for _ in range(5)]
    for idx, label in enumerate(label_pred):
        groups[label].append(X[idx])
    # Preserve the original debug output order: clusters 2, 1, 0.
    print(groups[2])
    print(groups[1])
    print(groups[0])
    for label in range(args.n):
        pts = np.array(groups[label])
        if pts.size == 0:
            continue  # empty cluster: nothing to draw, avoid bad indexing
        plt.scatter(pts[:, 0], pts[:, 1], color=colors[label],
                    label='label%d' % label)
    plt.scatter(x=centroids[:, 0], y=centroids[:, 1], marker='*',
                label='pred_center')
    plt.xlim(-3, 11)
    plt.ylim(-3, 11)
    plt.legend(loc=2)
    plt.show()
def get_data():
    """Generate three partially overlapping 2-D Gaussian clusters of 30
    samples each, display them in distinct colors, and return all 90
    points as a plain list of [x, y] pairs."""
    means = [(1, 1), (2, 5), (5, 2)]
    covs = [np.array([[1, 0], [0, 1]]),
            np.array([[2, 0], [0, 2]]),
            np.array([[1.5, 0], [0, 1.5]])]
    # 'raise' makes numpy fail loudly on a non-PSD covariance matrix.
    x = np.random.multivariate_normal(means[0], covs[0], (30,), 'raise')  # 30x2
    print(x)
    y = np.random.multivariate_normal(means[1], covs[1], (30,), 'raise')
    print(y)
    z = np.random.multivariate_normal(means[2], covs[2], (30,), 'raise')
    X = x.tolist() + y.tolist() + z.tolist()
    print(X)
    # One color per ground-truth class.
    plt.scatter(y[:, 0], y[:, 1], color='red', label='class0')
    plt.scatter(x[:, 0], x[:, 1], color='blue', label='class1')
    plt.scatter(z[:, 0], z[:, 1], color='yellow', label='class2')
    plt.xlim(-3, 11)
    plt.ylim(-3, 11)
    plt.legend(loc=2)
    return X
def main():
    """Generate the sample data, cluster it into args.n groups with
    K-Means, and display both the raw classes and the clustering."""
    X = get_data()
    clusterer = KMeans(n_clusters=args.n)  # build the clusterer
    clusterer.fit(X)                       # run the clustering
    centers = clusterer.cluster_centers_
    labels = clusterer.labels_             # per-sample cluster assignment
    print(centers)
    print(labels)
    plt.show()  # first figure: the ground-truth classes from get_data()
    show(labels, X, centers)


if __name__ == '__main__':
    main()
4、
import numpy as np
from matplotlib import pyplot as plt
from sklearn import svm
from sklearn.model_selection import train_test_split
import argparse
# CLI: --ratio is the fraction of samples held out as the test set.
# Description fixed: this is the SVM experiment, not K-Means (copy-paste error).
parser = argparse.ArgumentParser(description='SVM Test')
parser.add_argument('--ratio', default=0.3, type=float,
                    help='ratio of test set and train set')
# NOTE(review): the 'augment' default is never read in this script — confirm.
parser.set_defaults(augment=True)
# Parsed at import time; `args` is read by the __main__ guard below.
args = parser.parse_args()
print(args)
def get_data(r):
    """Draw two well-separated 2-D Gaussian classes (30 samples each),
    plot them, and return (x_train, x_test, y_train, y_test) with test
    fraction r."""
    means = [(1, 1), (5, 5)]
    covs = [np.array([[1, 0], [0, 1]]), np.array([[1, 0], [0, 1]])]
    x1 = np.random.multivariate_normal(means[0], covs[0], (30,))  # 30x2
    x2 = np.random.multivariate_normal(means[1], covs[1], (30,))
    X = x1.tolist() + x2.tolist()
    y = [0] * 30 + [1] * 30  # labels aligned with X's ordering
    plt.scatter(x1[:, 0], x1[:, 1], label='class0')
    plt.scatter(x2[:, 0], x2[:, 1], label='class1')
    plt.xlim(-3, 8)
    plt.ylim(-3, 8)
    plt.legend(loc=2)
    # Deterministic split (random_state=0) so reruns are comparable.
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=r,
                                                        random_state=0)
    return x_train, x_test, y_train, y_test
def main(r):
    """Train a linear SVM on the synthetic two-class data, report train and
    test accuracy, print all support vectors, and draw the decision
    boundary with its two margin lines."""
    train_data, test_data, train_label, test_label = get_data(r)
    sv = svm.SVC(gamma='auto', kernel='linear')
    sv.fit(train_data, train_label)
    print("SVM模型训练集的准确率:%.3f" % sv.score(train_data, train_label))
    print("SVM模型测试集的准确率:%.3f" % sv.score(test_data, test_label))
    w = sv.coef_[0]
    print("All support vectors:")
    print(sv.support_vectors_)
    plt.scatter(sv.support_vectors_[:, 0], sv.support_vectors_[:, 1],
                color="black", label="support vector")
    # Decision boundary w0*x + w1*y + b = 0 rewritten in slope-intercept
    # form: y = slope*x + offset.
    slope = -w[0] / w[1]
    offset = -sv.intercept_[0] / w[1]
    xs = np.linspace(-5, 15)
    boundary = slope * xs + offset
    # Margin lines: parallels to the boundary through one support vector of
    # each class — the first support vector belongs to class 0, the last to
    # class 1 (support_vectors_ is grouped by class).
    p0 = sv.support_vectors_[0]
    lower = slope * xs + (p0[1] - slope * p0[0])
    p1 = sv.support_vectors_[-1]
    upper = slope * xs + (p1[1] - slope * p1[0])
    plt.legend(loc=2)
    plt.plot(xs, boundary, 'k-')
    plt.plot(xs, lower, 'k--')
    plt.plot(xs, upper, 'k--')
    plt.show()


if __name__ == '__main__':
    main(args.ratio)