Fundamentals of Data Statistics and Analysis, Lab 4 (Hebuter: copy with caution)

Lab 4: Modeling and Solving with Machine Learning Algorithms

Objective: learn to solve practical statistical data-analysis tasks with basic machine-learning algorithms in common software, and become familiar with calling linear regression, SVM, k-means, PCA, and similar algorithms.

Tasks:

1. Using the data in the table below, build a three-variable linear regression model of the natural population growth rate on gross national income, the CPI growth rate, and per-capita GDP for the years 1990-2005.

2. Download the UCI wine dataset: http://archive.ics.uci.edu/ml/datasets/Wine. The file opens as plain text; each line is one wine record with 14 fields, of which the first is the wine's class label and the remaining 13 are its attributes. Using all 178 samples, run PCA on the 13 attributes, extract the components whose cumulative contribution rate exceeds 90%, and map the original 13-dimensional data onto the new components.

3.

(1) Randomly generate 3 classes with different means and variances that overlap slightly, 30 samples per class, and plot each class in a different color.

(2) Using k-means clustering, partition all the data into 3, 4, and 5 clusters, plotting each cluster in a different color.

This yields 4 figures in total.

4. Randomly generate 2 completely non-overlapping classes of 30 samples each, classify them with an SVM, return all support vectors, and present the result in a suitable figure.

Code and Results:

1.
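With y the natural population growth rate and x1, x2, x3 the gross national income, CPI growth rate, and per-capita GDP, the model to fit is

y = \beta_0 + \beta_1 x_1 + \beta_2 x_2 + \beta_3 x_3 + \varepsilon

and the code below estimates the intercept \beta_0 and the coefficient vector (\beta_1, \beta_2, \beta_3) by ordinary least squares.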

import pandas as pd
from sklearn.linear_model import LinearRegression
import xlrd

# raw string so the backslashes in the Windows path are not treated as escapes
data = xlrd.open_workbook(r'E:\数据分析与统计基础\实验\Project\exp4\dataset.xls')
table = data.sheets()[0]

# columns 1-3 hold the predictors: gross national income, CPI growth rate, per-capita GDP
data_x = pd.DataFrame(
    {"a": table.col_values(1, start_rowx=0, end_rowx=None), "b": table.col_values(2, start_rowx=0, end_rowx=None),
     "c": table.col_values(3, start_rowx=0, end_rowx=None)})
print(data_x)
# column 0 holds the response: the natural population growth rate
data_y = pd.Series(table.col_values(0, start_rowx=0, end_rowx=None))
print(data_y)

if __name__ == '__main__':
    reg = LinearRegression()
    reg.fit(data_x, data_y)
    print("Intercept: %.8f" % reg.intercept_)
    print("Coefficient vector:")
    print(reg.coef_)

    print("R-squared: %.8f" % reg.score(data_x, data_y))

2.
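The contribution rate used below is the eigenvalue share of the covariance matrix of the standardized attributes; components are kept until the cumulative share passes 0.9:

\mathrm{var\_exp}_i = \frac{\lambda_i}{\sum_{j=1}^{13} \lambda_j}, \qquad \sum_{i=1}^{k} \mathrm{var\_exp}_i > 0.9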

import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

if __name__ == '__main__':

    filename = r'E:\数据分析与统计基础\实验\Project\wine.txt'  # path to the downloaded wine data file
    pos = []
    with open(filename, 'r') as file_to_read:
        while True:
            lines = file_to_read.readline()  # read one record per line
            if not lines:
                break
            p_tmp = [float(i) for i in lines.split(sep=",")]
            pos.append(p_tmp)  # append the newly parsed record
    data = np.array(pos)
    print("Training data:")
    print(data)
    # print(data)
    X = data[:, 1:]
    # print(X)
    y = data[:, 0]
    # print(y)
    # standardize the attributes
    X_std = StandardScaler().fit(X).transform(X)
    # covariance matrix of the standardized data
    cov_mat = np.cov(X_std.T)
    # eigenvalues and eigenvectors
    eigen_vals, eigen_vecs = np.linalg.eig(cov_mat)
    tot = sum(eigen_vals)  # sum of all eigenvalues
    var_exp = [(i / tot) for i in eigen_vals]  # share of total variance explained by each component
    print("Contribution rate of each component:")
    print(var_exp)
    cum_var_exp = np.cumsum(var_exp)  # cumulative sum of var_exp
    # print(cum_var_exp)
    # plot individual and cumulative explained variance
    plt.bar(range(len(eigen_vals)), var_exp, width=1.0, bottom=0.0, alpha=0.5, label='individual explained variance')
    plt.step(range(len(eigen_vals)), cum_var_exp, where='post', label='cumulative explained variance')
    plt.ylabel('Explained variance ratio')
    plt.xlabel('Principal components')
    plt.legend(loc='best')
    plt.show()
    # cum_var_exp observed on this run:
    # [0.36198848 0.55406338 0.66529969 0.73598999 0.80162293 0.85098116
    #  0.89336795 0.9013201  0.92812759 0.94110992 0.96333145 0.98069981
    #  1.        ]
    # 8 components reach a cumulative contribution above 90%
    # count how many components are needed to pass 90%
    n = 0
    for i in range(len(cum_var_exp)):
        if cum_var_exp[i] < 0.9:
            n = n + 1
    n = n + 1
    print(n, "components together contribute more than 90%")
    index = np.argsort(-np.array(var_exp))
    # print(index[:n])
    pca = PCA(n)
    pca.fit(X_std)

    print(pca.explained_variance_ratio_)

    low_d = pca.transform(X_std)  # project the data onto the first n components
    # print(low_d.shape)
    # print(low_d)
    # print(sum(pca.explained_variance_ratio_))
    names = [1, 2, 3]
    plt.figure()
    for c, i, name in zip("rgb", [1, 2, 3], names):
        plt.scatter(low_d[y == i, 0], low_d[y == i, 1], c=c, label=name)
    plt.xlabel('Dimension1')
    plt.ylabel('Dimension2')
    plt.title("wine-standard-PCA")
    plt.legend()
    plt.show()
    # logistic regression on the projected data
    X_train, X_test, y_train, y_test = train_test_split(low_d, y, test_size=0.4, random_state=0)  # 40% held out for test
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    print("Test-set accuracy of logistic regression after PCA: %.4f" % lr.score(X_test, y_test))


3.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import argparse

parser = argparse.ArgumentParser(description='K-Means Test')
parser.add_argument('--n', default=3, type=int,
                    help='number of classes to classify')

args = parser.parse_args()
print(args)


def show(label_pred, X, centroids):
    # plot each predicted cluster in its own color
    X = np.array(X)
    colors = ['blue', 'red', 'yellow', 'black', 'deeppink']
    for k in range(args.n):
        cluster = X[label_pred == k]
        plt.scatter(cluster[:, 0], cluster[:, 1], color=colors[k], label='label%d' % k)
    plt.scatter(x=centroids[:, 0], y=centroids[:, 1], marker='*', label='pred_center')
    plt.xlim(-3, 11)
    plt.ylim(-3, 11)
    plt.legend(loc=2)
    plt.show()


def get_data():
    # three Gaussian classes with different means and covariances, close enough to overlap slightly
    mean = [(1, 1), (2, 5), (5, 2)]
    cov = [np.array([[1, 0], [0, 1]]), np.array([[2, 0], [0, 2]]), np.array([[1.5, 0], [0, 1.5]])]
    x = np.random.multivariate_normal(mean[0], cov[0], (30,), 'raise')  # 30x2 samples of class 0
    y = np.random.multivariate_normal(mean[1], cov[1], (30,), 'raise')
    z = np.random.multivariate_normal(mean[2], cov[2], (30,), 'raise')
    X = x.tolist() + y.tolist() + z.tolist()

    plt.scatter(x[:, 0], x[:, 1], color='blue', label='class0')
    plt.scatter(y[:, 0], y[:, 1], color='red', label='class1')
    plt.scatter(z[:, 0], z[:, 1], color='yellow', label='class2')
    plt.xlim(-3, 11)
    plt.ylim(-3, 11)
    plt.legend(loc=2)
    return X


def main():
    X = get_data()
    estimator = KMeans(n_clusters=args.n)  # build the clusterer
    estimator.fit(X)  # run k-means
    label_pred = estimator.labels_  # predicted cluster label of each sample
    centroids = estimator.cluster_centers_
    print(centroids)
    print(label_pred)
    plt.show()  # figure with the ground-truth classes
    show(label_pred, X, centroids)


if __name__ == '__main__':
    main()
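To compare the 3-, 4-, and 5-cluster runs beyond eyeballing the figures, a silhouette score gives one quality number per run (higher is better). A minimal sketch, reusing get_data() from the script above:

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

X = get_data()  # assumes the generator defined above
for k in (3, 4, 5):
    labels = KMeans(n_clusters=k).fit_predict(X)
    print("%d clusters: silhouette score %.3f" % (k, silhouette_score(X, labels)))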


4.

import numpy as np
from matplotlib import pyplot as plt
from sklearn import svm
from sklearn.model_selection import train_test_split
import argparse

parser = argparse.ArgumentParser(description='SVM Test')
parser.add_argument('--ratio', default=0.3, type=float,
                    help='ratio of test set to train set')
args = parser.parse_args()
print(args)


def get_data(r):
    # two well-separated Gaussian classes, 30 samples each
    mean = [(1, 1), (5, 5)]
    cov = [np.array([[1, 0], [0, 1]]), np.array([[1, 0], [0, 1]])]
    x1 = np.random.multivariate_normal(mean[0], cov[0], (30,))  # 30x2
    x2 = np.random.multivariate_normal(mean[1], cov[1], (30,))
    X = x1.tolist() + x2.tolist()
    y = [0] * 30 + [1] * 30
    plt.scatter(x1[:, 0], x1[:, 1], label='class0')
    plt.scatter(x2[:, 0], x2[:, 1], label='class1')
    plt.xlim(-3, 8)
    plt.ylim(-3, 8)
    plt.legend(loc=2)
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=r, random_state=0)
    return x_train, x_test, y_train, y_test


def main(r):
    train_data, test_data, train_label, test_label = get_data(r)
    sv = svm.SVC(gamma='auto', kernel='linear')

    sv.fit(train_data, train_label)
    print("SVM accuracy on the training set: %.3f" % sv.score(train_data, train_label))
    print("SVM accuracy on the test set: %.3f" % sv.score(test_data, test_label))
    w = sv.coef_[0]
    print("All support vectors:")
    print(sv.support_vectors_)
    plt.scatter(sv.support_vectors_[:, 0], sv.support_vectors_[:, 1], color="black", label="support vector")
    # the separating hyperplane w_0*x + w_1*y + intercept = 0 can be
    # rewritten in slope-intercept form as y = -(w_0/w_1)*x - intercept/w_1
    a = -w[0] / w[1]
    b = -sv.intercept_[0] / w[1]
    xx = np.linspace(-5, 15)
    yy = a * xx + b
    # margin line y = kx + b through support vector A(b[0], b[1]);
    # the first support vector belongs to class 0
    b = sv.support_vectors_[0]
    yy_down = a * xx + (b[1] - a * b[0])

    # margin line through support vector B(b[0], b[1]);
    # the last support vector belongs to class 1
    b = sv.support_vectors_[-1]
    yy_up = a * xx + (b[1] - a * b[0])
    plt.legend(loc=2)
    plt.plot(xx, yy, 'k-')
    plt.plot(xx, yy_down, 'k--')
    plt.plot(xx, yy_up, 'k--')
    plt.show()


if __name__ == '__main__':
    main(args.ratio)
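The margin lines above depend on the first and last support vectors lying on opposite sides, which holds here because a two-class SVC stores support vectors grouped by class, but it reads fragile. An alternative sketch that derives the margin width 1/||w|| directly from the coefficients, assuming sv, a, xx, and yy from main() are in scope:

import numpy as np

# perpendicular distance from the hyperplane to either margin line is 1/||w||
margin = 1 / np.sqrt(np.sum(sv.coef_ ** 2))
# convert the perpendicular distance into a vertical offset of the line
yy_down = yy - np.sqrt(1 + a ** 2) * margin
yy_up = yy + np.sqrt(1 + a ** 2) * margin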

