Andrew Ng Machine Learning ex2: Python Implementation

This project contains a Python implementation of exercise 2 from Andrew Ng's Machine Learning course. The main topics are logistic regression and regularization.

1. Logistic Regression

Build a logistic regression model that predicts whether a student is admitted to a university. For each training example you have the student's scores on two exams and the final admission decision.

1.1 Visualizing the Data

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

path=r'C:\Users\xxx\Desktop\机器学习\machine-learning-ex2\machine-learning-ex2\ex2\ex2data1.txt'
data=pd.read_csv(path,header=None,names=['Exam 1','Exam 2','Admitted'])
data.head()
      Exam 1     Exam 2  Admitted
0  34.623660  78.024693         0
1  30.286711  43.894998         0
2  35.847409  72.902198         0
3  60.182599  86.308552         1
4  79.032736  75.344376         1
positive = data[data['Admitted'].isin([1])]
negative = data[data['Admitted'].isin([0])]

fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(positive['Exam 1'], positive['Exam 2'], s=50, c='b', marker='o', label='Admitted')
ax.scatter(negative['Exam 1'], negative['Exam 2'], s=50, c='r', marker='x', label='Not Admitted')
ax.legend()
ax.set_xlabel('Exam 1 Score')
ax.set_ylabel('Exam 2 Score')
plt.show()

[Figure: scatter plot of Exam 1 vs. Exam 2 scores, admitted (o) vs. not admitted (x)]

1.2 Defining the Functions

Define the sigmoid function:

$$g(z) = \frac{1}{1 + e^{-z}}$$

def sigmoid(z):
    return 1 / (1 + np.exp(-z))
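As a quick sanity check (a hypothetical snippet, not part of the original exercise), the sigmoid should return exactly 0.5 at z = 0 and saturate toward 0 and 1 at the extremes:

print(sigmoid(np.array([-10, 0, 10])))  # expected: ~0, 0.5, ~1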

Define the cost function:

$$J(\theta) = \frac{1}{m} \sum_{i=1}^{m} \left[ -y^{(i)} \log\left(h_\theta(x^{(i)})\right) - \left(1 - y^{(i)}\right) \log\left(1 - h_\theta(x^{(i)})\right) \right]$$

where the hypothesis is

$$h_\theta(x) = g(\theta^T x)$$

def cost(theta, X, y):
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)
    first = np.multiply(-y, np.log(sigmoid(X * theta.T)))
    second = np.multiply((1 - y), np.log(1 - sigmoid(X * theta.T)))
    return np.sum(first - second) / (len(X))
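np.matrix is discouraged in modern NumPy; as a minimal sketch (my own rewrite, assuming theta, X, y are plain ndarrays), the same cost can be written with array operations:

def cost_vec(theta, X, y):
    h = sigmoid(X @ theta)  # predictions, shape (m,)
    y = y.ravel()           # flatten (m, 1) labels to (m,)
    return np.mean(-y * np.log(h) - (1 - y) * np.log(1 - h))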

Define the gradient (note that this computes only the gradient of the cost; the descent itself is delegated to a SciPy optimizer below):

$$\frac{\partial J(\theta)}{\partial \theta_j} = \frac{1}{m} \sum_{i=1}^{m} \left(h_\theta(x^{(i)}) - y^{(i)}\right) x_j^{(i)}$$

def gradient(theta, X, y):
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)
    
    parameters = int(theta.ravel().shape[1])
    grad = np.zeros(parameters)
    
    error = sigmoid(X * theta.T) - y
    
    for i in range(parameters):
        term = np.multiply(error, X[:,i])
        grad[i] = np.sum(term) / len(X)
    
    return grad
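The per-parameter loop can be collapsed into a single matrix product; a sketch under the same ndarray assumption as cost_vec above:

def gradient_vec(theta, X, y):
    error = sigmoid(X @ theta) - y.ravel()  # (m,) residuals
    return X.T @ error / len(X)             # (1/m) * X^T (h - y)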

1.3 Data Preprocessing

data.insert(0, 'Ones', 1)  # add an intercept column

# initialize X, y, theta
cols = data.shape[1]
X = data.iloc[:,:-1]
y = data.iloc[:,cols-1:cols]
theta = np.zeros(3)

# convert X and y to NumPy arrays
X = np.array(X.values)
y = np.array(y.values)
X.shape, theta.shape, y.shape
((100, 3), (3,), (100, 1))

1.4 Fitting the Model

With θ initialized to zeros, every prediction is sigmoid(0) = 0.5, so the initial cost should be ln 2 ≈ 0.6931:

cost(theta, X, y)
0.6931471805599453

Call a SciPy optimizer to minimize the cost (fmin_tnc implements a truncated Newton method, so this replaces hand-written gradient descent):

import scipy.optimize as opt
result = opt.fmin_tnc(func=cost, x0=theta, fprime=gradient, args=(X, y))
result
(array([-25.16131872,   0.20623159,   0.20147149]), 36, 0)
cost(result[0],X,y)
0.20349770158947425
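fmin_tnc is one of SciPy's legacy interfaces; the same fit can be written against the newer scipy.optimize.minimize API (a sketch reusing the cost and gradient defined above):

res = opt.minimize(fun=cost, x0=theta, args=(X, y), method='TNC', jac=gradient)
res.x  # fitted parameters, comparable to result[0]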

1.5 Plotting the Decision Boundary

plotting_x1 = np.linspace(30, 100, 100)
plotting_h1 = (-result[0][0] - result[0][1] * plotting_x1) / result[0][2]
# the boundary satisfies theta0 + theta1*x + theta2*y = 0
fig,ax= plt.subplots(figsize=(12,8))
ax.plot(plotting_x1,plotting_h1,'y',label='prediction')
ax.scatter(positive['Exam 1'], positive['Exam 2'], s=50, c='b', marker='o', label='Admitted')
ax.scatter(negative['Exam 1'], negative['Exam 2'], s=50, c='r', marker='x', label='Not Admitted')
ax.legend()
ax.set_xlabel('Exam 1 Score')
ax.set_ylabel('Exam 2 Score')
plt.show()

[Figure: fitted linear decision boundary over the scatter plot]

1.6 Computing the Accuracy

def hfunc1(theta, X):
    return sigmoid(np.sum(X * theta))

# probability of admission for exam scores 45 and 85
hfunc1(result[0], [1, 45, 85])
0.776290625526598

def predict(theta, X):
    probability = sigmoid(X * theta.T)
    return [1 if x >= 0.5 else 0 for x in probability]

theta_min = np.matrix(result[0])
predictions = predict(theta_min, X)
correct = [1 if ((a == 1 and b == 1) or (a == 0 and b == 0))
           else 0 for (a, b) in zip(predictions, y)]
acc = (sum(map(int, correct)) / len(correct) * 100)
print('acc={0}%'.format(acc))
acc=89.0%
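The same accuracy can be computed more concisely with plain NumPy; a minimal sketch (my own shortcut, not from the original exercise):

acc2 = np.mean(np.array(predictions) == y.ravel()) * 100
print('acc={0}%'.format(acc2))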

2. Regularized Logistic Regression

This part adds a regularization term on top of the logistic regression above.

2.1 Visualizing the Data

path=r'C:\Users\xxx\Desktop\机器学习\machine-learning-ex2\machine-learning-ex2\ex2\ex2data2.txt'
data_init=pd.read_csv(path,header=None,names=['Test 1', 'Test 2', 'Accepted'])
data_init.head()
     Test 1   Test 2  Accepted
0  0.051267  0.69956         1
1 -0.092742  0.68494         1
2 -0.213710  0.69225         1
3 -0.375000  0.50219         1
4 -0.513250  0.46564         1
positive2 = data_init[data_init['Accepted'].isin([1])]
negative2 = data_init[data_init['Accepted'].isin([0])]

fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(positive2['Test 1'], positive2['Test 2'], s=50, c='b', marker='o', label='Accepted')
ax.scatter(negative2['Test 1'], negative2['Test 2'], s=50, c='r', marker='x', label='Not Accepted')
ax.legend()
plt.show()

[Figure: scatter plot of the two microchip test scores, accepted vs. not accepted]

2.2 Adding Polynomial Features

degree = 6
data2 = data_init  # note: this is a reference to data_init, not a copy
x1 = data2['Test 1']
x2 = data2['Test 2']
data2.insert(3, 'Ones', 1)

# map the two inputs onto all polynomial terms x1^(i-j) * x2^j up to degree 6
for i in range(1, degree+1):
    for j in range(0, i+1):
        data2['F' + str(i-j) + str(j)] = np.power(x1, i-j) * np.power(x2, j)

data2.drop('Test 1', axis=1, inplace=True)
data2.drop('Test 2', axis=1, inplace=True)
data2.head()
   Accepted  Ones       F10      F01       F20       F11       F02       F30       F21       F12  ...       F23       F14       F05           F60           F51       F42       F33       F24       F15       F06
0         1     1  0.051267  0.69956  0.002628  0.035864  0.489384  0.000135  0.001839  0.025089  ...  0.000900  0.012278  0.167542  1.815630e-08  2.477505e-07  0.000003  0.000046  0.000629  0.008589  0.117206
1         1     1 -0.092742  0.68494  0.008601 -0.063523  0.469143 -0.000798  0.005891 -0.043509  ...  0.002764 -0.020412  0.150752  6.362953e-07 -4.699318e-06  0.000035 -0.000256  0.001893 -0.013981  0.103256
2         1     1 -0.213710  0.69225  0.045672 -0.147941  0.479210 -0.009761  0.031616 -0.102412  ...  0.015151 -0.049077  0.158970  9.526844e-05 -3.085938e-04  0.001000 -0.003238  0.010488 -0.033973  0.110047
3         1     1 -0.375000  0.50219  0.140625 -0.188321  0.252195 -0.052734  0.070620 -0.094573  ...  0.017810 -0.023851  0.031940  2.780914e-03 -3.724126e-03  0.004987 -0.006679  0.008944 -0.011978  0.016040
4         1     1 -0.513250  0.46564  0.263426 -0.238990  0.216821 -0.135203  0.122661 -0.111283  ...  0.026596 -0.024128  0.021890  1.827990e-02 -1.658422e-02  0.015046 -0.013650  0.012384 -0.011235  0.010193

5 rows × 29 columns
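scikit-learn ships an equivalent feature mapper; a sketch using sklearn.preprocessing.PolynomialFeatures (not part of the original exercise, and note that its column ordering differs from the F-naming above):

from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=6)  # include_bias=True adds the 'Ones' column
X_mapped = poly.fit_transform(np.column_stack([x1, x2]))
X_mapped.shape  # (118, 28): all terms x1^a * x2^b with a + b <= 6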

2.3 Defining the Regularized Cost and Gradient

The regularized cost function:

$$J(\theta) = \frac{1}{m} \sum_{i=1}^{m} \left[ -y^{(i)} \log\left(h_\theta(x^{(i)})\right) - \left(1 - y^{(i)}\right) \log\left(1 - h_\theta(x^{(i)})\right) \right] + \frac{\lambda}{2m} \sum_{j=1}^{n} \theta_j^2$$

def costReg(theta, X, y, learningRate):
    # 'learningRate' here is actually the regularization parameter lambda
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)
    first = np.multiply(-y, np.log(sigmoid(X * theta.T)))
    second = np.multiply((1 - y), np.log(1 - sigmoid(X * theta.T)))
    reg = (learningRate / (2 * len(X))) * np.sum(np.power(theta[:,1:], 2))  # theta0 excluded
    return np.sum(first - second) / len(X) + reg

The gradient, split on whether the parameter index j is 0 (θ0 is not regularized):

$$\frac{\partial J(\theta)}{\partial \theta_0} = \frac{1}{m} \sum_{i=1}^{m} \left(h_\theta(x^{(i)}) - y^{(i)}\right) x_0^{(i)}$$

$$\frac{\partial J(\theta)}{\partial \theta_j} = \frac{1}{m} \sum_{i=1}^{m} \left(h_\theta(x^{(i)}) - y^{(i)}\right) x_j^{(i)} + \frac{\lambda}{m}\theta_j \qquad (j \geq 1)$$

def gradientReg(theta, X, y, learningRate):
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)
    
    parameters = int(theta.ravel().shape[1])
    grad = np.zeros(parameters)
    
    error = sigmoid(X * theta.T) - y
    
    for i in range(parameters):
        term = np.multiply(error, X[:,i])
        
        if (i == 0):
            grad[i] = np.sum(term) / len(X)
        else:
            grad[i] = (np.sum(term) / len(X)) + ((learningRate / len(X)) * theta[:,i])
    return grad
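As in Section 1, the loop has a compact vectorized form; a sketch assuming plain ndarrays, with theta[0] excluded from the penalty:

def gradientReg_vec(theta, X, y, learningRate):
    error = sigmoid(X @ theta) - y.ravel()
    grad = X.T @ error / len(X)
    grad[1:] += (learningRate / len(X)) * theta[1:]  # no penalty on theta[0]
    return grad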

2.4 Preparing the Data

cols=data2.shape[1]
X2 = data2.iloc[:,1:cols]
y2 = data2.iloc[:,0:1]
theta2 = np.zeros(cols-1)

X2=np.array(X2.values)
y2=np.array(y2.values)

learningRate = 1  # regularization parameter lambda

2.5 Fitting and Evaluating the Model

costReg(theta2,X2,y2,learningRate)
0.6931471805599454
result2=opt.fmin_tnc(func=costReg,x0=theta2,fprime=gradientReg, args=(X2, y2, learningRate))
result2
(array([ 1.27271027,  0.62529965,  1.18111686, -2.01987399, -0.91743189,
        -1.43166929,  0.12393228, -0.36553118, -0.35725403, -0.17516291,
        -1.45817009, -0.05098418, -0.61558554, -0.27469165, -1.19271298,
        -0.24217841, -0.20603298, -0.04466178, -0.27778951, -0.29539513,
        -0.45645981, -1.04319155,  0.02779373, -0.2924487 ,  0.0155576 ,
        -0.32742405, -0.1438915 , -0.92467487]), 32, 1)
theta_min = np.matrix(result2[0])
predictions = predict(theta_min, X2)
correct = [1 if a == b else 0 for (a, b) in zip(predictions, y2)]
accuracy = (sum(map(int, correct)) / len(correct)) * 100
print(sum(map(int, correct)))
print('accuracy = {0}%'.format(accuracy))
98
accuracy = 83.05084745762711%

2.6 Plotting the Decision Boundary

def hfunc2(theta, x1, x2):
    # evaluate theta^T * mapped features at (x1, x2); theta[0] is the coefficient array
    temp = theta[0][0]
    place = 0
    for i in range(1, degree+1):
        for j in range(0, i+1):
            temp += np.power(x1, i-j) * np.power(x2, j) * theta[0][place+1]
            place += 1
    return temp

def find_decision_boundary(theta):
    # evaluate the hypothesis on a dense grid and keep points near probability 0.5
    t1 = np.linspace(-1, 1.5, 1000)
    t2 = np.linspace(-1, 1.5, 1000)
    coordinates = [(x, y) for x in t1 for y in t2]
    x_cord, y_cord = zip(*coordinates)
    h_val = pd.DataFrame({'x1': x_cord, 'x2': y_cord})
    h_val['hval'] = hfunc2(theta, h_val['x1'], h_val['x2'])
    decision = h_val[np.abs(sigmoid(h_val['hval']) - 0.5) < 0.01]
    return decision.x1, decision.x2
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(positive2['Test 1'], positive2['Test 2'], s=50, c='b', marker='o', label='Accepted')
ax.scatter(negative2['Test 1'], negative2['Test 2'], s=50, c='r', marker='x', label='Rejected')
ax.set_xlabel('Test 1 Score')
ax.set_ylabel('Test 2 Score')

x, y = find_decision_boundary(result2)
plt.scatter(x, y, c='y', s=10, label='Prediction')
ax.legend()
plt.show()
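An alternative to scattering near-boundary grid points is matplotlib's contour plot, which traces the level set θᵀx = 0 (i.e. probability 0.5) directly; a sketch reusing hfunc2 and result2:

t = np.linspace(-1, 1.5, 200)
xx, yy = np.meshgrid(t, t)
zz = hfunc2(result2, xx.ravel(), yy.ravel()).reshape(xx.shape)
plt.contour(xx, yy, zz, levels=[0], colors='y')  # theta^T x = 0 is the 0.5 boundary
plt.show()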

λ = 100 underfits:

[Figure: decision boundary with λ = 100 (underfitting)]

λ = 0 overfits:

[Figure: decision boundary with λ = 0 (overfitting)]

λ = 1 gives a reasonably good fit:

[Figure: decision boundary with λ = 1 (good fit)]

Summary

  1. θ0 (the intercept term) does not need to be regularized.
  2. The zip() function takes iterables as arguments, packs their corresponding elements into tuples, and returns a sequence of those tuples. Combined with the * operator, zip() can also be used to unzip a list; see the short example below.
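A quick illustration of both directions (a hypothetical snippet):

pairs = list(zip([1, 2, 3], ['a', 'b', 'c']))  # [(1, 'a'), (2, 'b'), (3, 'c')]
nums, chars = zip(*pairs)                      # unzip: (1, 2, 3) and ('a', 'b', 'c')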