吴恩达机器学习作业Python实现之logistic回归带正则项

Learningisgood

已于 2022-02-25 14:44:25 修改

阅读量267

点赞数

文章标签：机器学习 回归 python

于 2022-02-16 11:02:07 首次发布

本文链接：https://blog.csdn.net/learningisgood/article/details/122958503

版权

import os

# Work from the folder that holds the exercise data files, then report
# the directory we ended up in.
os.chdir('E:/ML/machine-learning-ex2')
print('现在工作目录是 ' + os.getcwd())

#可视化数据ex2data1.txt

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Load the first data set: two exam scores and the admission label per row.
ex2data1 = pd.read_csv('ex2data1.txt', names=['exam1', 'exam2', 'admitted'])
# Prepend a column of ones so theta[0] acts as the intercept term.
ex2data1.insert(0, 'intercept', np.ones(ex2data1.shape[0]))

# Design matrix (intercept + two scores) and labels.
# y is sliced with 3: (not 3) so it stays two-dimensional; the original
# author notes that a one-dimensional y causes shape problems later on.
X = ex2data1.iloc[:, 0:3]
y = ex2data1.iloc[:, 3:]

# Boolean row masks for the two classes, taken from the label column.
pos = (ex2data1.iloc[:, -1] == 1)
neg = (ex2data1.iloc[:, -1] == 0)

# np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
X_pos = np.asmatrix(X[pos])
X_neg = np.asmatrix(X[neg])

# Scatter plot of both classes: column 1 on the x axis, column 2 on the y axis.
fig, ax = plt.subplots()

ax.scatter(X_pos[:, 1].flat, X_pos[:, 2].flat, marker='x', label='Admitted')
ax.scatter(X_neg[:, 1].flat, X_neg[:, 2].flat, marker='o', label='Not Admitted')
ax.legend(loc='upper right')
ax.set_xlabel('exam1')
ax.set_ylabel('exam2')
plt.show()

#定义逻辑函数和代价函数；
def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), evaluated element-wise.

    ``z`` may be a scalar or a NumPy array/matrix; the result has the
    same shape as ``z``.
    """
    denominator = 1 + np.exp(-z)
    return 1 / denominator

def computeCost(theta,X,y,L=0):
    """Return the regularized logistic-regression cost and its gradient.

    Parameters
    ----------
    theta : array-like holding X.shape[1] values (any shape; it is
        reshaped to a column).
    X : array-like of shape (m, n). The first column is left out of the
        regularization penalty, i.e. it is treated as the intercept.
    y : array-like holding m labels (reshaped to a column).
    L : regularization strength; 0 (the default) disables the penalty.

    Returns
    -------
    cost : 1x1 ``np.matrix``
    grad : (n, 1) ``np.matrix``
    """
    # np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
    X = np.asmatrix(X)
    # Take m from X rather than len(y): len() of a row-vector y would be 1.
    m, n = X.shape
    y = np.asmatrix(y).reshape(m, 1)
    theta = np.asmatrix(theta).reshape(n, 1)

    # Copy of theta with the first (intercept) entry zeroed, so that entry
    # contributes neither to the penalty nor to its gradient.
    penalized = theta.copy()
    penalized[0, 0] = 0

    z = X * theta
    # -log(h) = log(1 + e^-z) and -log(1 - h) = log(1 + e^z). Writing them
    # with logaddexp avoids log(0) (inf/nan cost) when the sigmoid saturates.
    neg_log_h = np.logaddexp(0, -z)
    neg_log_one_minus_h = np.logaddexp(0, z)
    h = np.exp(-neg_log_h)  # the sigmoid of z

    cost = 1/m*(y.T*neg_log_h + (1-y).T*neg_log_one_minus_h) + L/2/m*penalized.T*penalized
    grad = 1/m*X.T*(h-y) + L/m*penalized
    return cost, grad

# Gradient descent -- first, plain version (no regularization argument)
def gradientDescent(theta,X,y,alpha,iterations):
    """Run ``iterations`` fixed-step gradient-descent updates.

    Every step calls ``computeCost(theta, X, y)`` and moves ``theta`` by
    ``-alpha * grad``.

    Returns the final ``theta`` and an ``(iterations, 1)`` array holding
    the cost recorded at each step, which can be inspected to judge
    whether the run behaved as hoped.

    The original author notes that running this on the un-scaled data
    produced "RuntimeWarning: divide by zero encountered in log".
    """
    cost_history = np.zeros((iterations, 1))
    for step in range(iterations):
        current_cost, gradient = computeCost(theta, X, y)
        cost_history[step] = current_cost
        theta = theta - alpha * gradient
    return theta, cost_history

# Run gradient descent on the raw (un-scaled) features.

# Start from an all-zero theta with one entry per column of X.
theta = np.zeros((X.shape[1], 1))
theta, J = gradientDescent(theta, X, y, alpha=0.1, iterations=5000)

#解决方法一：对数据X进行均值标准化，可以减少计算量；
#解决方法二：运用高级梯度下降算法：scipy.optimize.fmin_tnc

#解决方法一：对数据X进行均值标准化，可以减少计算量；


#特征标准化
def normal(X):
    """Mean-normalize the columns of a DataFrame, leaving the first as is.

    Each column becomes ``(x - mean) / (max - min)``. The first column
    (the constant/intercept column) is kept unchanged by using mean 0 and
    range 1 for it. Any other column whose values are all equal is given
    range 1 as well, so it maps to zeros instead of dividing by zero.

    Parameters
    ----------
    X : pandas DataFrame (``.iloc`` and column-wise reductions are used).

    Returns
    -------
    A new DataFrame with the same shape and columns; ``X`` is not modified.
    """
    # Reduce per column explicitly. np.mean/np.max/np.min applied to a
    # DataFrame reduce over both axes on pandas 2.x and return a scalar,
    # which has no .iloc.
    mean_X = X.mean(axis=0)
    range_X = X.max(axis=0) - X.min(axis=0)

    # Intercept column: mean 0 and range 1 leave it untouched.
    mean_X.iloc[0] = 0
    range_X.iloc[0] = 1

    # Constant columns have zero range; use 1 to avoid 0/0.
    range_X[range_X == 0] = 1

    return (X - mean_X) / range_X


# Apply normal() to the design matrix X and keep the result as normal_X.
normal_X=normal(X)


# Gradient descent -- second definition; same loop as before, but it now
# forwards a regularization strength L to computeCost.
def gradientDescent(theta,X,y,alpha,iterations,L=0):
    """Run ``iterations`` fixed-step gradient-descent updates.

    Every step calls ``computeCost(theta, X, y, L)`` and moves ``theta``
    by ``-alpha * grad``. ``L`` is the regularization parameter and
    defaults to 0.

    Returns the final ``theta`` and an ``(iterations, 1)`` array holding
    the cost recorded at each step, which can be inspected to judge
    whether the run behaved as hoped.
    """
    cost_history = np.zeros((iterations, 1))
    for step in range(iterations):
        current_cost, gradient = computeCost(theta, X, y, L)
        cost_history[step] = current_cost
        theta = theta - alpha * gradient
    return theta, cost_history

# Run gradient descent on the normalized features.

theta = np.zeros((X.shape[1], 1))  # initial theta: all zeros
# The theta found here fits the normalized data normal_X, not the raw X.
theta, J = gradientDescent(theta, normal_X, y, 0.1, 5000)

print('best of theta is:'+str(theta))

# Top panel: cost recorded at every iteration.
fig, axs = plt.subplots(2, 1)
axs[0].plot(J)
axs[0].set_ylabel('J')

# Bottom panel: normalized data with the decision boundary.
# np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
normal_X_pos = np.asmatrix(normal_X[pos])
normal_X_neg = np.asmatrix(normal_X[neg])
axs[1].scatter(normal_X_pos[:, 1].flat, normal_X_pos[:, 2].flat, marker='x', label='Admitted')
axs[1].scatter(normal_X_neg[:, 1].flat, normal_X_neg[:, 2].flat, marker='o', label='Not Admitted')
axs[1].legend(loc='upper right')
axs[1].set_xlabel('exam1')
axs[1].set_ylabel('exam2')

# The boundary is where theta0 + theta1*x1 + theta2*x2 = 0; solve for x2.
X1 = np.linspace(-0.5, 0.5, 30)
intercept = np.ones((30))
X2 = (-intercept*theta[0, 0] - X1*theta[1, 0]) / theta[2, 0]
axs[1].plot(X1, X2)


plt.show()

#解决方法二：运用高级梯度下降算法：scipy.optimize.fmin_tnc

import scipy.optimize as opt
# Fit theta on the raw (un-scaled) data with scipy.optimize.fmin_tnc,
# using computeCost for both the cost and the gradient.
# np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
X = np.asmatrix(X)
y = np.asmatrix(y)
theta = np.asmatrix(np.zeros((X.shape[1], 1)))
result = opt.fmin_tnc(func=computeCost, x0=theta, args=(X, y))
# result[0] is the solution vector; put it back into theta's column shape.
# This theta fits the raw data X.
theta = result[0].reshape(theta.shape)

# Plot the raw data with the decision boundary.
plt.scatter(X_pos[:, 1].flat, X_pos[:, 2].flat, marker='x', label='pos')
plt.scatter(X_neg[:, 1].flat, X_neg[:, 2].flat, marker='o', label='neg')
plt.legend(loc='upper right')


# The boundary is where theta0 + theta1*x1 + theta2*x2 = 0; solve for x2.
X1 = np.linspace(25, 100, 30)
intercept = np.ones((30))
X2 = (-intercept*theta[0, 0] - X1*theta[1, 0]) / theta[2, 0]
plt.plot(X1, X2)

plt.show()

# Visualize the data in ex2data2.txt


ex2data2 = pd.read_csv('ex2data2.txt', names=['exam1', 'exam2', 'admitted'])
# Prepend a column of ones so theta[0] acts as the intercept term.
ex2data2.insert(0, 'intercept', np.ones(ex2data2.shape[0]))


# Design matrix (intercept + two features) and labels.
# y is sliced with 3: (not 3) so it stays two-dimensional; the original
# author notes that a one-dimensional y causes shape problems later on.
X = ex2data2.iloc[:, 0:3]
y = ex2data2.iloc[:, 3:]


# Boolean row masks for the two classes, taken from the label column.
pos = (ex2data2.iloc[:, -1] == 1)
neg = (ex2data2.iloc[:, -1] == 0)

# np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
X_pos = np.asmatrix(X[pos])
X_neg = np.asmatrix(X[neg])
fig, ax = plt.subplots()

ax.scatter(X_pos[:, 1].flat, X_pos[:, 2].flat, marker='x', label='Admitted')
ax.scatter(X_neg[:, 1].flat, X_neg[:, 2].flat, marker='o', label='Not Admitted')
ax.legend(loc='upper right')
ax.set_xlabel('exam1')
ax.set_ylabel('exam2')
plt.show()

# Keep the un-expanded three-column frame; X is overwritten further down.
XX = X

#扩展特征
def mapFeature(X,degree):
    """Append polynomial terms of columns 1 and 2 of X up to ``degree``.

    For every total power ``i`` from 2 to ``degree`` and every ``j`` from
    0 to ``i``, the column ``X[:, 1]**j * X[:, 2]**(i - j)`` is appended
    (in that order) to the right of the original columns. With
    ``degree < 2`` nothing is appended.

    Parameters
    ----------
    X : array-like with at least three columns; column 0 is carried
        through unchanged, columns 1 and 2 are the features expanded.
    degree : highest total power to generate.

    Returns
    -------
    ``np.matrix`` containing the original columns followed by the new ones.
    """
    import numpy as np

    # np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
    X = np.asmatrix(X)
    first, second = X[:, 1], X[:, 2]

    # Collect all pieces and join once, rather than re-copying the growing
    # matrix on every appended column.
    columns = [X]
    for total in range(2, degree + 1):
        for j in range(total + 1):
            columns.append(np.multiply(np.power(first, j), np.power(second, total - j)))

    return np.asmatrix(np.concatenate(columns, axis=1))

# Fit the feature-expanded model with scipy.optimize.fmin_tnc, once without
# and once with regularization, and draw both decision boundaries.

degree = 8  # highest polynomial power, used for both fitting and plotting
X = mapFeature(XX, degree)
import scipy.optimize as opt
# np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
X = np.asmatrix(X)
y = np.asmatrix(y)
theta = np.asmatrix(np.zeros((X.shape[1], 1)))
result1 = opt.fmin_tnc(func=computeCost, x0=theta, args=(X, y, 0))  # L=0: overfits
result2 = opt.fmin_tnc(func=computeCost, x0=theta, args=(X, y, 2))  # L=2: counters the overfitting
# Solution vectors reshaped to columns; both fit the expanded matrix X.
theta1 = result1[0].reshape(theta.shape)
theta2 = result2[0].reshape(theta.shape)


# Plot the decision boundaries: top panel L=0, bottom panel L=2.

fig, axs = plt.subplots(2, 1)
axs[0].scatter(X_pos[:, 1].flat, X_pos[:, 2].flat, marker='x', label='pos')
axs[0].scatter(X_neg[:, 1].flat, X_neg[:, 2].flat, marker='o', label='neg')
axs[0].legend(loc='upper right')


axs[1].scatter(X_pos[:, 1].flat, X_pos[:, 2].flat, marker='x', label='pos')
axs[1].scatter(X_neg[:, 1].flat, X_neg[:, 2].flat, marker='o', label='neg')
axs[1].legend(loc='upper right')

# Evaluate theta^T x on a 30x30 grid over [-1, 1] x [-1, 1]. Grid points
# are laid out as rows (1, x1, x2) and expanded like the training data.
X1 = np.linspace(-1, 1, 30)
X2 = np.linspace(-1, 1, 30)
XX1, XX2 = np.meshgrid(X1, X2)
XX_ravel = np.concatenate((np.ones((XX1.size, 1)), XX1.ravel().reshape(-1, 1), XX2.ravel().reshape(-1, 1)), axis=1)

XX_ravel = mapFeature(XX_ravel, degree)
# Convert to plain arrays before reshaping back to the grid shape.
z1 = np.asarray(XX_ravel*theta1).reshape(XX1.shape)
z2 = np.asarray(XX_ravel*theta2).reshape(XX1.shape)

# The boundary is the single level z = 0. A bare integer 0 would be read
# as a number of levels rather than as the level value.
axs[0].contour(XX1, XX2, z1, levels=[0])
axs[1].contour(XX1, XX2, z2, levels=[0])




plt.show()

上方子图是不带正则项（L=0）时的决策边界，可以看出明显的过拟合；

下方子图是带正则项（L=2）时的决策边界，可以看出正则项能有效减轻过拟合。

本例子也说明另一个问题，针对欠拟合问题，可以通过扩展特征来解决。扩展特征的同时带上正则项，更加合理。

吴恩达机器学习编程作业

链接: https://pan.baidu.com/s/1cpMM0xWZ1Dxs8HhVAmeUkA 提取码: a36e

Learningisgood

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
吴恩达机器学习作业Python实现之logistic回归带正则项

import osos.chdir('E:/ML/machine-learning-ex2')print('现在工作目录是 '+str(os.getcwd()))#可视化数据ex2data1.txtimport pandas as pdimport matplotlib.pyplot as pltimport numpy as npex2data1=pd.read_csv('ex2data1.txt',names=['exam1','exam2','admitted'])e.
复制链接

扫一扫