Week2 Python之机器学习_data=np.hstack(x.reshape(1000,1)-CSDN博客

本文链接：https://blog.csdn.net/lingzzz/article/details/122713917

本文探讨了多项式回归在处理二次和三次多项式数据集的应用，通过调整多项式阶数减少过拟合。同时，介绍了逻辑回归的简单分类与多元分类实例，包括马疝气病和鸢尾花数据集的处理，展示了不同策略下的模型性能和决策边界绘制。

摘要由CSDN通过智能技术生成

1 多项式回归

1.1 生成数据集

我们首先通过Numpy的随机采样函数获取数据集，同时添加一定的噪声，并将其保存至文件中。

import numpy as np
# Draw 100 inputs uniformly from [-3, 3) and build noisy quadratic targets
# y = 2x^2 + 3x + e, where e is standard normal noise.
sample_shape = (100, 1)
x = np.random.uniform(-3, 3, size=sample_shape)
noise = np.random.normal(0, 1, size=sample_shape)
y = 2 * x ** 2 + 3 * x + noise
# Store inputs and targets side by side, one comma-separated "x,y" pair per row.
data = np.hstack((x, y))
np.savetxt("dataset.txt", data, delimiter=",")

1.2 读取数据集并可视化

同上篇的方法将数据读取，并通过Matplotlib绘制散点图像。

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties

# Load the two-column CSV: inputs in column 0, targets in column 1.
data = np.loadtxt('data3.txt', delimiter=',')
X, y = data[:, 0], data[:, 1]
# Scatter the raw samples as red dots.
plt.figure()
plt.plot(X, y, 'r.')
plt.show()

散点图：

1.3 模型选择（sklearn版本）

我们首先使用上篇的线性回归模型进行试验

我们可以看到，其训练后的函数拟合效果不尽如人意，故而仅使用线性回归模型来描述该数据集显然是不全面的。我们便想到通过多项式函数来拟合这种关系更为复杂的数据，其公式为：

$f(x) = \sum_{i=0}^{n}k^{[i]}x^{i}$

十次多项式示例：

from sklearn import pipeline
from sklearn import linear_model
from sklearn import preprocessing
from sklearn import metrics 
import matplotlib.pyplot as plt
import numpy as np

# Load the samples and turn both columns into (n, 1) column vectors.
data = np.loadtxt('data3.txt', delimiter=',')
X = data[:, 0].reshape(-1, 1)
y = data[:, 1].reshape(-1, 1)
plt.figure()
plt.plot(X, y, 'r.')

# Degree-10 polynomial feature expansion followed by a linear regression.
poly_features = preprocessing.PolynomialFeatures(10)  # tune this degree to get a smoother curve
model = pipeline.make_pipeline(poly_features, linear_model.LinearRegression())
model.fit(X, y)

# Evaluate the fitted curve on 1000 evenly spaced points spanning the observed inputs.
xp = np.linspace(X.min(), X.max(), 1000).reshape(-1, 1)
yp = model.predict(xp)
plt.plot(xp, yp, label='PolyFit Line')

plt.show()

可见，上图的函数曲线并不十分平滑，泛化能力较低，存在一定的过拟合现象。我们可以通过降低多项式的次数来使曲线变得平滑，从而减少过拟合现象。

preprocessing.PolynomialFeatures(2), # this degree can be tuned to obtain a smoother curve

1.4 手撸版本

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties

def CostFunction(X,y,theta): # loss function
    """Return the halved mean squared error of the linear model ``np.dot(X, theta)``.

    Args:
        X: design matrix, one row per sample.
        y: target values; its element count is used as the sample count.
        theta: coefficient vector.

    Returns:
        ``sum((X·theta - y)^2) / (2 * m)``, accumulated with the built-in
        ``sum`` over the first axis of the squared residuals.
    """
    residual = np.dot(X, theta) - y
    sample_count = np.size(y)
    squared_error = np.square(residual)
    return sum(squared_error) / (2 * sample_count)

def GradientDescent(X,y,n,theta,alpha,steps): # gradient descent
    """Fit linear-regression coefficients with batch gradient descent.

    The gradient of the halved mean squared error is computed with one matrix
    product per iteration instead of a Python loop over the samples. The
    per-step cost history of the earlier version was never returned, so it is
    no longer computed.

    Args:
        X: design matrix of shape (m, n + 1), one row per sample.
        y: the m target values (column vector or flat array).
        n: polynomial degree. Kept for backward compatibility; the number of
            coefficients follows from the shapes of X and theta.
        theta: initial coefficient column vector of shape (n + 1, 1).
        alpha: learning rate.
        steps: number of iterations to run.

    Returns:
        The coefficient column vector after ``steps`` updates.
    """
    m = np.size(y)
    targets = np.reshape(y, (-1, 1))  # a flat y must not broadcast against the (m, 1) prediction
    for _ in range(steps):
        residual = np.reshape(np.dot(X, theta), (-1, 1)) - targets
        gradient = np.dot(X.T, residual) / m
        theta = theta - alpha * gradient
    return theta

def PowerData(X,n): # raise the input vector to successive powers
    """Build the polynomial design matrix ``[1, X, X^2, ..., X^n]``.

    Args:
        X: input column vector; its element count sets the number of rows.
        n: highest power to include.

    Returns:
        Array whose first column is all ones, followed by one column block
        per power from 1 to n.
    """
    row_count = np.size(X)
    blocks = [np.ones([row_count, 1])]
    for degree in range(1, n + 1):
        blocks.append(np.power(X, degree))
    return np.hstack(blocks)

# Load the samples as column vectors and scatter them as red dots.
data = np.loadtxt('data3.txt', delimiter=',')
X = data[:, 0].reshape(-1, 1)
y = data[:, 1].reshape(-1, 1)
plt.figure()
plt.plot(X, y, 'r.')

# 1000 evenly spaced inputs over the observed range, used to draw the fitted curve.
# Must be built before X is replaced by its power expansion below.
xp = np.linspace(X.min(), X.max(), 1000).reshape(-1, 1)

n = 2                            # polynomial degree
steps = 1500                     # number of iterations
alpha = 0.01                     # learning rate
theta = np.zeros([n + 1, 1])     # coefficient vector, initialised to zeros

# Expand the inputs into powers, then fit the coefficients by gradient descent.
X = PowerData(X, n)
theta = GradientDescent(X, y, n, theta, alpha, steps)

# Evaluate the fitted polynomial on the grid; column 1 of the expansion is the first power.
xp = PowerData(xp, n)
yp = np.dot(xp, theta)
plt.plot(xp[:, 1], yp, label='PolyFit Line')
print(theta)
plt.show()

图像：

对于生成的三次项数据集的拟合效果：

2 逻辑回归

2.1 简单分类模型

2.1.1 导入数据集并绘制图像

数据集

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties

# Each row of testSet.txt: two feature values followed by a class label.
data = np.loadtxt('testSet.txt')
X = data[:, 0:2].reshape(-1, 2)
y = data[:, 2].reshape(-1, 1)
l = np.size(y)
plt.figure()
# One dot per sample: red when the label equals 1, blue otherwise.
for i in range(l):
    marker = 'r.' if y[i] == 1 else 'b.'
    plt.plot(X[i, 0], X[i, 1], marker)

图像（我们要做的便是画一条函数图线，以此区分开红色与蓝色的点）：

2.1.2 模型构建

我们观察图像发现，图像上的红点和蓝点可以用一条直线近似地分割开来，我们称所生成的这条直线为决策边界，逻辑回归模型所要实现的功能便是寻找决策边界的函数表达式。

我们将二分类问题看成是将数值归类为0和1的过程，也可以理解为判断一个物体是否为目标的概率，故而需要使用一个值域在(0,1)内的函数。我们引入Sigmoid函数: $f(x) = \frac{1}{1 + e^{-x}}$ ，并通过其良好的性质，从而作为逻辑回归下假设函数的一部分。

假设函数： $h(x) = \frac{1}{1 + e^{-\Theta^{T} x}}$

在求解损失函数时，我们为了使其为一个凸函数，因此引入对数运算处理指数项，从而得到损失函数公式： $L(x) = -\left(y\ln(h_{\Theta}(x)) + (1-y)\ln(1-h_{\Theta}(x))\right)$

综上，我们便可以应用梯度下降法使损失函数最小化，并求得此时的系数解。

2.1.3 sklearn版本

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from sklearn.linear_model import LogisticRegression

def Normalization(X):
    """Min-max scale X into [0, 1].

    The minimum and maximum are taken over the whole array (not per column),
    so every feature is shifted and scaled by the same two numbers.
    """
    lowest = np.min(X)
    span = np.max(X) - lowest
    return (X - lowest) / span

# Each row of testSet.txt: two feature values followed by a class label.
data = np.loadtxt('testSet.txt')
X = data[:,0:2].reshape(-1,2)
X = Normalization(X)
xp = np.linspace(X[:,0].min(),X[:,0].max(),100)
y = data[:,2].reshape(-1,1)
l = np.size(y)  # sample count; this listing used `l` without defining it
plt.figure()
# One dot per sample: red when the label equals 1, green otherwise.
for i in range(0,l):
    if(y[i] == 1):
        plt.plot(X[i,0],X[i,1],'r.')
    else:
        plt.plot(X[i,0],X[i,1],'g.')
model = LogisticRegression()
# Pass the labels as a flat array; a column vector makes scikit-learn emit a
# DataConversionWarning.
model.fit(X,y.ravel())
# Decision boundary: intercept + c0 * x0 + c1 * x1 = 0, solved for x1 along the x0 grid.
yp = -(model.intercept_ + model.coef_[0,0] * xp) / model.coef_[0,1]
plt.plot(xp,yp)
print(model.coef_)
print(model.intercept_)
plt.show()

决策边界图像：

2.1.4 手撸版本

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties

def Normalization(X):    # rescale the data
    """Min-max scale X into [0, 1] using the minimum and maximum of the whole array."""
    smallest = np.min(X)
    value_range = np.max(X) - smallest
    shifted = X - smallest
    return shifted / value_range

def SigmoidFunction(x):
    """Return the logistic sigmoid ``1 / (1 + e^(-x))`` of x, element-wise for arrays."""
    denominator = 1 + np.exp(-x)
    return 1.0 / denominator

def ClassificationFunction(X,theta):
    """Return the linear score ``np.dot(X, theta)`` that is later fed to the sigmoid."""
    linear_score = np.dot(X, theta)
    return linear_score

def GradientDescent(X,y,t,theta,alpha,steps): # gradient descent
    """Fit logistic-regression coefficients with batch gradient descent.

    Each iteration evaluates the hypothesis
    ``SigmoidFunction(ClassificationFunction(X, theta))`` and moves theta
    against the averaged gradient ``X^T (h - y) / m``, computed with a single
    matrix product. The earlier per-step call to ``CostFunction`` is gone:
    this listing never defines that function, and its result was discarded.

    Args:
        X: design matrix of shape (m, t), one row per sample.
        y: the m class labels (column vector or flat array).
        t: number of coefficients. Kept for backward compatibility; the size
            follows from the shapes of X and theta.
        theta: initial coefficient column vector of shape (t, 1).
        alpha: learning rate.
        steps: number of iterations to run.

    Returns:
        The coefficient column vector after ``steps`` updates.
    """
    m = np.size(y)
    targets = np.reshape(y, (-1, 1))  # a flat y must not broadcast against the (m, 1) hypothesis
    for _ in range(steps):
        h = SigmoidFunction(ClassificationFunction(X, theta))
        residual = np.reshape(h, (-1, 1)) - targets
        gradient = np.dot(X.T, residual) / m
        theta = theta - alpha * gradient
    return theta

def PowerData(X,n,r): # raise the input matrix to successive powers
    """Build the design matrix ``[1, X, X^2, ..., X^n]`` for r input variables.

    Args:
        X: input matrix with r columns.
        n: highest power to include.
        r: number of input variables; the row count is ``size(X) / r``.

    Returns:
        Array whose first column is all ones, followed by one block of
        element-wise powers of X for each degree from 1 to n.
    """
    row_count = int(np.size(X) / r)
    blocks = [np.ones([row_count, 1])]
    for degree in range(1, n + 1):
        blocks.append(np.power(X, degree))
    return np.hstack(blocks)

# Each row of testSet.txt: two feature values followed by a class label.
data = np.loadtxt('testSet.txt')
X = Normalization(data[:, 0:2].reshape(-1, 2))
y = data[:, 2].reshape(-1, 1)
l = np.size(y)
# Grid over the first (normalised) feature, used to draw the decision boundary.
xp = np.linspace(X[:, 0].min(), X[:, 0].max(), 100)

plt.figure()
# One dot per sample: red when the label equals 1, green otherwise.
for i in range(l):
    marker = 'r.' if y[i] == 1 else 'g.'
    plt.plot(X[i, 0], X[i, 1], marker)

n = 1                      # polynomial degree
r = 2                      # number of input variables
t = r * n + 1              # coefficient count: one bias term plus r columns per degree
steps = 4500               # number of iterations
alpha = 0.01               # learning rate
theta = np.zeros([t, 1])   # coefficient vector, initialised to zeros

X = PowerData(X, n, r)
print(X)
theta = GradientDescent(X, y, t, theta, alpha, steps)
# Decision boundary: theta0 + theta1 * x0 + theta2 * x1 = 0, solved for x1 along the x0 grid.
yp = -(theta[0] + theta[1] * xp) / theta[2]
plt.plot(xp, yp)
print(theta)
plt.show()

决策边界图像：

2.2 多元分类之“马疝气病数据集”

马疝气病数据集是分类算法的经典数据集，该数据集包含了368个样本和28个特征，其最终反映马匹在获得疝气病后的死亡情况。我们的算法便是需要从该数据集中寻找到马疝气病的特征与死亡情况之间的关系，最终得以预测。（注意：下文代码读取的数据文件中，每个样本含21个特征列，第22列为类别标签。）

我们将数据集分为训练集与测试集两部分，通过训练集训练模型，然后通过测试集测试模型的可靠性。

数据集下载链接：提取码：1234 提取码：1234

2.2.1 sklearn版本

import numpy as np
from sklearn.linear_model import LogisticRegression

def Normalization(X):
    """Min-max scale X into [0, 1] with the global minimum and maximum of the array."""
    floor = np.min(X)
    width = np.max(X) - floor
    return (X - floor) / width

def ReadData(file):
    """Load a numeric text table and split it into features and labels.

    Columns 0-20 are returned as the min-max normalised feature matrix and
    column 21 as an (m, 1) label column.
    """
    table = np.loadtxt(file)
    features = Normalization(table[:, 0:21].reshape(-1, 21))
    labels = table[:, 21].reshape(-1, 1)
    return features, labels

# Train on the training file and report accuracy on the held-out test file.
X,y = ReadData('horseColicTraining.txt')
xp,yp = ReadData('horseColicTest.txt')
l = np.size(y)
model = LogisticRegression()
# ReadData returns labels as (m, 1) columns; flatten them so scikit-learn does
# not emit a DataConversionWarning.
model.fit(X,y.ravel())
t = model.score(xp,yp.ravel())  # mean accuracy on the test set
print("测试正确率为：%.2f" % (t * 100))

输出结果：

测试正确率为：76.12

2.2.2 手撸版本

我们将上节的二元Logistic回归中的数据矩阵由二维升至多维，即可解决多元分类的问题。

import numpy as np

def Normalization(X):
    """Min-max scale X into [0, 1]; the extremes are taken over the entire array."""
    minimum = np.min(X)
    extent = np.max(X) - minimum
    return (X - minimum) / extent

def SigmoidFunction(x):
    """Logistic sigmoid ``1 / (1 + e^(-x))``, applied element-wise to arrays."""
    decay = np.exp(-x)
    return 1.0 / (1 + decay)

def ClassificationFunction(X,theta):
    """Compute the linear combination ``np.dot(X, theta)`` of features and coefficients."""
    weighted_sum = np.dot(X, theta)
    return weighted_sum

def CostFunction(X,y,theta): # loss function
    """Return the halved mean squared error of the logistic hypothesis.

    The error is measured between the labels and
    ``SigmoidFunction(ClassificationFunction(X, theta))``, i.e. the predicted
    probability. The earlier version compared the labels against the raw
    linear score, which is not the quantity this classifier outputs.

    Args:
        X: design matrix, one row per sample.
        y: class labels; its element count is used as the sample count.
        theta: coefficient vector.

    Returns:
        ``sum((h - y)^2) / (2 * m)``, accumulated with the built-in ``sum``
        over the first axis.
    """
    m = np.size(y)
    h = SigmoidFunction(ClassificationFunction(X, theta))
    return sum(np.square(h - y)) / (2 * m)

def GradientDescent(X,y,t,theta,alpha,steps): # gradient descent
    """Fit logistic-regression coefficients with batch gradient descent.

    Each iteration evaluates the hypothesis
    ``SigmoidFunction(ClassificationFunction(X, theta))`` and moves theta
    against the averaged gradient ``X^T (h - y) / m``. The gradient is one
    matrix product instead of a Python loop over the samples. The per-step
    cost history was never returned, so it is no longer computed.

    Args:
        X: design matrix of shape (m, t), one row per sample.
        y: the m class labels (column vector or flat array).
        t: number of coefficients. Kept for backward compatibility; the size
            follows from the shapes of X and theta.
        theta: initial coefficient column vector of shape (t, 1).
        alpha: learning rate.
        steps: number of iterations to run.

    Returns:
        The coefficient column vector after ``steps`` updates.
    """
    m = np.size(y)
    targets = np.reshape(y, (-1, 1))  # a flat y must not broadcast against the (m, 1) hypothesis
    for _ in range(steps):
        h = SigmoidFunction(ClassificationFunction(X, theta))
        residual = np.reshape(h, (-1, 1)) - targets
        gradient = np.dot(X.T, residual) / m
        theta = theta - alpha * gradient
    return theta

def ModelTest(X,theta,y):                # measure model accuracy
    """Count the samples whose predicted probability lies within 0.5 of the label.

    Args:
        X: design matrix of the test samples.
        theta: fitted coefficient vector.
        y: true labels, one per sample.

    Returns:
        The number of samples (not the ratio) with ``|y - h| < 0.5``.
    """
    probability = SigmoidFunction(ClassificationFunction(X, theta))
    deviation = abs(y - probability)
    return sum(1 for k in range(np.size(deviation)) if deviation[k] < 0.5)

def PowerData(X,n,r): # raise the input matrix to successive powers
    """Return ``[1, X, X^2, ..., X^n]`` stacked column-wise.

    Args:
        X: input matrix with r columns.
        n: highest power to include.
        r: number of input variables; the row count is ``size(X) / r``.
    """
    rows = int(np.size(X) / r)
    pieces = [np.ones([rows, 1])]
    pieces.extend(np.power(X, degree) for degree in range(1, n + 1))
    return np.hstack(pieces)

def ReadData(file):
    """Read a numeric text table; return (features, labels).

    The first 21 columns become the min-max normalised feature matrix and
    column 21 becomes an (m, 1) label column.
    """
    raw = np.loadtxt(file)
    label_column = raw[:, 21].reshape(-1, 1)
    feature_block = raw[:, 0:21].reshape(-1, 21)
    return Normalization(feature_block), label_column

n = 1                      # polynomial degree
r = 21                     # number of input variables
t = r * n + 1              # coefficient count: one bias term plus r columns per degree
steps = 4500               # number of iterations
alpha = 0.01               # learning rate
theta = np.zeros([t, 1])   # coefficient vector, initialised to zeros

# Train on the training file.
X, y = ReadData('horseColicTraining.txt')
X = PowerData(X, n, r)
theta = GradientDescent(X, y, t, theta, alpha, steps)

# Evaluate on the test file; note that t is reused and now holds the number of correct predictions.
xp, yp = ReadData('horseColicTest.txt')
xp = PowerData(xp, n, r)
t = ModelTest(xp, theta, yp)
accuracy = t / np.size(yp) * 100
print("测试正确率为：%.2f" % accuracy)  # print the accuracy measured on the test set

输出结果：

测试正确率为：71.64

结果表明，该模型的预测准确率是较高的。

倘若我们更改一下多项式函数的最高项次数，即：

n = 2                   # polynomial degree

其输出结果为：

测试正确率为：73.13

可见在此示例中，提高函数的最高项次数对模型的预测准确率有一定优化，不过并不明显。同时，过大地提高其最高项次数可能产生过拟合的后果，从而降低模型的泛化能力。

2.3 多分类模型之“鸢尾花分类数据集”

2.3.1 “一对其余”分类法

在将一个数据集的样本分类为n种类型时，我们可以训练n个分类器，每个分类器负责区分某一类样本与其余样本，从而将问题简化为n个二分类问题；最后比较各分类器返回的概率，取概率最高者对应的类别作为分类结果，从而实现多类别分类。

鸢尾花数据集下载地址：UCI数据库

我们将下载好的数据库随机分成训练集与测试集两部分，然后通过Logistic回归模型训练参数。

2.3.2 程序总览

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties

def Normalization(X):
    """Rescale X to [0, 1] by subtracting the global minimum and dividing by the global range."""
    base = np.min(X)
    spread = np.max(X) - base
    return (X - base) / spread

def SigmoidFunction(x):
    """Map x through the logistic sigmoid ``1 / (1 + e^(-x))`` (element-wise for arrays)."""
    one_plus_exp = 1 + np.exp(-x)
    return 1.0 / one_plus_exp

def ClassificationFunction(X,theta):
    """Return ``np.dot(X, theta)``, the linear score of each sample before the sigmoid."""
    score = np.dot(X, theta)
    return score

def CostFunction(X,y,theta): # loss function
    """Return the halved mean squared error between the sigmoid hypothesis and y.

    Args:
        X: design matrix, one row per sample.
        y: class labels; its element count is used as the sample count.
        theta: coefficient vector.

    Returns:
        ``sum((h - y)^2) / (2 * m)`` where ``h`` is
        ``SigmoidFunction(ClassificationFunction(X, theta))``, accumulated
        with the built-in ``sum`` over the first axis.
    """
    hypothesis = SigmoidFunction(ClassificationFunction(X, theta))
    squared_error = np.square(hypothesis - y)
    sample_count = np.size(y)
    return sum(squared_error) / (2 * sample_count)

def GradientDescent(X,y,t,theta,alpha,steps): # gradient descent
    """Fit logistic-regression coefficients and record the cost after every step.

    Each iteration evaluates the hypothesis
    ``SigmoidFunction(ClassificationFunction(X, theta))`` and moves theta
    against the averaged gradient ``X^T (h - y) / m``. The gradient is one
    matrix product instead of a Python loop over the samples, which matters
    for the very large step counts this listing uses.

    Whenever the 1-based step number is a multiple of ``steps / 100`` the
    value ``step / (steps / 100)`` is printed as a progress indicator.

    Args:
        X: design matrix of shape (m, t), one row per sample.
        y: the m class labels (column vector or flat array).
        t: number of coefficients. Kept for backward compatibility; the size
            follows from the shapes of X and theta.
        theta: initial coefficient column vector of shape (t, 1).
        alpha: learning rate.
        steps: number of iterations to run.

    Returns:
        ``(theta, J_list)``: the coefficients after ``steps`` updates and a
        (steps, 1) array holding ``CostFunction(X, y, theta)`` after each update.
    """
    m = np.size(y)
    targets = np.reshape(y, (-1, 1))  # a flat y must not broadcast against the (m, 1) hypothesis
    J_list = np.zeros([steps,1])
    report_every = steps / 100
    for i in range(steps):
        completed = i + 1
        if completed % report_every == 0:
            print(completed / report_every)
        h = SigmoidFunction(ClassificationFunction(X, theta))
        residual = np.reshape(h, (-1, 1)) - targets
        gradient = np.dot(X.T, residual) / m
        theta = theta - alpha * gradient
        J_list[i] = CostFunction(X,y,theta)
    return theta,J_list

def ModelTrain(X,y,theta,t,alpha,steps,n,J_list):
    """Train n one-vs-rest classifiers, one per entry of ``y``.

    Args:
        X: shared design matrix.
        y: sequence of n label columns, one per classifier.
        theta: list of n initial coefficient vectors; updated in place.
        t: number of coefficients per classifier.
        alpha: learning rate.
        steps: number of gradient-descent iterations per classifier.
        n: number of classifiers to train.
        J_list: list of n cost histories; updated in place.

    Returns:
        The same ``theta`` and ``J_list`` lists, holding the fitted
        coefficients and the cost history of each classifier.
    """
    for k in range(n):
        fitted, history = GradientDescent(X, y[k], t, theta[k], alpha, steps)
        theta[k] = fitted
        J_list[k] = history
    return theta, J_list

def ModelPredict(xp,theta,n):
    """Predict a 1-based class label for every row of ``xp``.

    Every one of the n classifiers scores all samples at once; each sample is
    assigned the class whose classifier returns the highest probability. Ties
    go to the lowest class index. The earlier version printed every single
    probability from the inner loop, a leftover debug statement that is
    removed here.

    Args:
        xp: design matrix of shape (m, t), one row per sample.
        theta: sequence of n coefficient column vectors of shape (t, 1).
        n: number of classifiers / classes.

    Returns:
        Float array of shape (m, 1) holding labels in ``1..n``.
    """
    m = xp.shape[0]
    if n <= 0:
        # With no classifier to consult, every sample falls back to label 1.
        return np.ones([m,1])
    scores = np.hstack([
        np.reshape(SigmoidFunction(ClassificationFunction(xp, theta[j])), (-1, 1))
        for j in range(n)
    ])
    # argmax returns the first maximum, which keeps the lowest-index tie-break.
    winners = np.argmax(scores, axis=1) + 1
    return winners.astype(float).reshape(-1, 1)

def ModelTest(yp,y):                # measure model accuracy
    """Count the predictions that lie within 0.5 of the true label.

    Args:
        yp: predicted labels.
        y: true labels, same shape as ``yp``.

    Returns:
        The number of matching entries (a count, not a ratio).
    """
    deviation = abs(y - yp)
    return sum(1 for k in range(np.size(deviation)) if deviation[k] < 0.5)

def PowerData(X,n,r): # raise the input matrix to successive powers
    """Assemble the design matrix ``[1, X, X^2, ..., X^n]``.

    Args:
        X: input matrix with r columns.
        n: highest power to include.
        r: number of input variables; the row count is ``size(X) / r``.

    Returns:
        A column of ones followed by the element-wise powers of X, degree 1
        through n, stacked horizontally.
    """
    sample_count = int(np.size(X) / r)
    columns = [np.ones([sample_count, 1])]
    columns += [np.power(X, degree) for degree in range(1, n + 1)]
    return np.hstack(columns)

def MutiClassifier(y,n):
    """Split multi-class labels into n one-vs-rest indicator columns.

    Args:
        y: class labels taking values in ``1..n``.
        n: number of classes.

    Returns:
        A list of n float arrays of shape (m, 1); entry ``k`` holds 1 where
        the label equals ``k + 1`` and 0 elsewhere.
    """
    m = np.size(y)
    y_data = []
    for label in range(1, n + 1):
        indicator = np.zeros([m, 1])
        for j in range(m):
            indicator[j] = 1 if y[j] == label else 0
        y_data.append(indicator)
    return y_data

def ReadData(file):
    """Read a numeric text table; return (features, labels).

    Columns 0-3 become the min-max normalised feature matrix and column 4
    becomes an (m, 1) label column.
    """
    table = np.loadtxt(file)
    features = Normalization(table[:, 0:4].reshape(-1, 4))
    labels = table[:, 4].reshape(-1, 1)
    return features, labels

def Drawfigure(x,J_list):
    """Plot the cost curves of the three classifiers and their sum in a 2x2 grid.

    Args:
        x: horizontal-axis values (the iteration numbers).
        J_list: sequence whose first three entries are the per-classifier
            cost histories; every curve is scaled by 10000 before plotting.
    """
    plt.figure()
    panels = (
        (1, J_list[0], 'Classifier_1'),
        (2, J_list[1], 'Classifier_2'),
        (3, J_list[2], 'Classifier_3'),
        (4, sum(J_list), 'Sum of Cost-function'),
    )
    for slot, curve, caption in panels:
        plt.subplot(2, 2, slot)
        plt.plot(x, curve * 10000)
        plt.ylabel(caption)
    plt.show()


c = 3                                          # number of classes
n = 1                                          # polynomial degree
r = 4                                          # number of input variables
t = r * n + 1                                  # coefficient count: one bias term plus r columns per degree
steps = 500000                                 # number of iterations
alpha = 0.008                                  # learning rate
theta = [np.zeros([t, 1]) for _ in range(c)]        # one zero-initialised coefficient vector per class
J_list = [np.zeros([steps, 1]) for _ in range(c)]   # one cost history per class

# Train one one-vs-rest classifier per class on the training file.
X, y = ReadData('irisTrain.txt')
X = PowerData(X, n, r)
y_data = MutiClassifier(y, c)
theta, J_list = ModelTrain(X, y_data, theta, t, alpha, steps, c, J_list)

# Predict the test file; note that t is reused and now holds the number of correct predictions.
xp, y_ans = ReadData('irisTest.txt')
xp = PowerData(xp, n, r)
yp = ModelPredict(xp, theta, c)
t = ModelTest(yp, y_ans)
accuracy = t / np.size(yp) * 100
print("测试正确率为：%.2f" % accuracy)  # print the accuracy measured on the test set

# Iteration numbers 1..steps as a column, used as the horizontal axis of the cost plots.
x = np.array(list(range(1, steps + 1))).reshape(-1, 1)
Drawfigure(x, J_list)

输出结果：