入门机器学习(六)--课后作业解析-logistic回归(python实现)

最新推荐文章于 2023-11-15 16:33:00 发布

爱吃骨头的猫、

最新推荐文章于 2023-11-15 16:33:00 发布

阅读量6k

点赞数 5

分类专栏： •机器学习基础文章标签：入门机器学习课后作业 logistics回归 python

本文链接：https://blog.csdn.net/qq_42580947/article/details/89460077

版权

•机器学习基础专栏收录该内容

21 篇文章 21 订阅

订阅专栏

编程作业2 logistic_regression（逻辑回归）

推荐运行环境：python 3.6

建立一个逻辑回归模型来预测一个学生是否被大学录取。根据两次考试的结果来决定每个申请人的录取机会。有以前的申请人的历史数据，可以用它作为逻辑回归的训练集

python实现逻辑回归。目标：建立分类器（求解出三个参数 θ0、θ1、θ2），即得出分界线。备注：θ1 对应 'Exam 1' 成绩，θ2 对应 'Exam 2' 成绩。设定阈值，根据阈值判断录取结果。备注：阈值指的是最终得到的概率值，用于将概率值转化成一个类别；一般概率 ≥ 0.5 判定为被录取，< 0.5 判定为未被录取。实现内容：

sigmoid : 映射到概率的函数 model : 返回预测结果值 cost : 根据参数计算损失 gradient : 计算每个参数的梯度方向 descent : 进行参数更新 accuracy: 计算精度

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight') #样式美化
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report#这个包是评价报告

准备数据

# Load the training set; the file has no header row, so column names are given here.
data = pd.read_csv('ex2data1.txt', names=['exam1', 'exam2', 'admitted'])
data.head()  # first five rows (only rendered when run as a notebook cell)

data.describe()  # summary statistics (only rendered when run as a notebook cell)

# Style settings: "darkgrid" theme (grey background, white grid) and a two-colour RdBu palette.
sns.set(context="notebook", style="darkgrid", palette=sns.color_palette("RdBu", 2), color_codes=False)

# x / y are passed by keyword and `height` replaces the former `size` argument:
# current seaborn releases accept neither positional x/y nor `size`.
sns.lmplot(x='exam1', y='exam2', hue='admitted', data=data,
           height=6,
           fit_reg=False,             # fit_reg controls whether a fitted line is drawn
           scatter_kws={"s": 50})     # hue overlays the classes of 'admitted' in one figure
plt.show()  # take a look at the raw data

def get_X(df):
    """Read the features: build the design matrix from ``df``.

    A column of ones (the intercept feature) is placed in front of the columns
    of ``df`` and the last column of ``df`` (the target) is dropped.
    ``concat`` is used so that ``df`` itself is not modified; this is not
    efficient for big data sets.

    Returns:
        A 2-D ``numpy.ndarray`` (not a ``numpy.matrix``) with one row per row
        of ``df``.
    """
    # Reuse df's index: concat(axis=1) aligns rows on the index, so a ones
    # column with a default RangeIndex would misalign against a df whose
    # index is not 0..m-1 and introduce NaN rows.
    ones = pd.DataFrame({'ones': np.ones(len(df))}, index=df.index)
    data = pd.concat([ones, df], axis=1)  # add the ones column on the left
    # DataFrame.as_matrix() no longer exists in pandas; to_numpy() is its replacement.
    return data.iloc[:, :-1].to_numpy()


def get_y(df):
    """Read the labels: return the last column of ``df`` as a NumPy array.

    The last column is assumed to be the target.
    """
    target_column = df.iloc[:, -1]  # positional selection of the last column
    return np.array(target_column)


def normalize_feature(df):
    """Scale every column of ``df`` to (column - mean) / std.

    The scaling function is applied column by column through
    ``DataFrame.apply`` (default axis 0). Feature scaling is just as
    applicable to logistic regression.
    """
    def _standardise(column):
        centred = column - column.mean()
        return centred / column.std()

    return df.apply(_standardise)
X = get_X(data)  # design matrix: ones column followed by every column of `data` except the last
y = get_y(data)  # labels: the last column of `data`

sigmoid 函数

def sigmoid(z):
    """Logistic function 1 / (1 + e^(-z)); works on scalars and NumPy arrays."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator

cost function(代价函数)

theta = np.zeros(3)  # X is m x n, so theta holds n values (n = 3 here)


def cost(theta, X, y):
    """Cost function: the negative mean log-likelihood -l(theta), to be minimized.

    Computes mean(-y * log(h) - (1 - y) * log(1 - h)) with
    h = sigmoid(X @ theta), but evaluates the two log terms directly from
    z = X @ theta:

        -log(h)     = log(1 + e^(-z)) = logaddexp(0, -z)
        -log(1 - h) = log(1 + e^( z)) = logaddexp(0,  z)

    The literal formula yields nan or inf as soon as the sigmoid saturates to
    exactly 0 or 1 (0 * log(0)); this form stays finite for any z.
    """
    z = X @ theta  # Hint: X @ theta is equivalent to X.dot(theta)
    return np.mean(y * np.logaddexp(0, -z) + (1 - y) * np.logaddexp(0, z))

gradient descent(梯度下降)

def gradient(theta, X, y):
    """Gradient of the cost: (1 / m) * X^T (sigmoid(X @ theta) - y), m = len(X)."""
    residual = sigmoid(X @ theta) - y  # prediction minus label, one entry per row of X
    return (1 / len(X)) * X.T @ residual

拟合参数

import scipy.optimize as opt
# Minimize `cost` starting from `theta` with the Newton-CG method; `gradient` is passed as the Jacobian.
res = opt.minimize(fun=cost, x0=theta, args=(X, y), method='Newton-CG', jac=gradient)
print(res)  # print the optimizer's result object

用训练集预测和验证

def predict(x, theta):
    """Return 0/1 predictions: 1 where sigmoid(x @ theta) is at least 0.5."""
    above_threshold = sigmoid(x @ theta) >= 0.5
    return above_threshold.astype(int)  # convert booleans to integers
final_theta = res.x  # parameter vector found by the optimizer
y_pred = predict(X, final_theta)  # 0/1 predictions for the training rows
print(classification_report(y, y_pred))  # compare the predictions with the labels

寻找决策边界

# The boundary is theta0 + theta1*x1 + theta2*x2 = 0; dividing by -theta2 gives
# x2 = coef[0] + coef[1]*x1.
coef = -(res.x / res.x[2])  # find the equation
x = np.arange(130, step=0.1)
y = coef[0] + coef[1]*x  # NOTE: this rebinds the module-level name `y` (previously the labels)
# Uses the "notebook" context; per the original note, context controls the scale of the output figure.
sns.set(context="notebook", style="ticks", font_scale=1.5)

# x / y are passed by keyword and `height` replaces the former `size` argument:
# current seaborn releases accept neither positional x/y nor `size`.
sns.lmplot(x='exam1', y='exam2', hue='admitted', data=data,
           height=6,
           fit_reg=False,
           scatter_kws={"s": 25})

plt.plot(x, y, 'grey')
plt.xlim(0, 130)
plt.ylim(0, 130)
plt.title('Decision Boundary')
plt.show()

3- 正则化逻辑回归

# Second data set; the file has no header row, so column names are given here.
df = pd.read_csv('ex2data2.txt', names=['test1', 'test2', 'accepted'])
df.head()  # first five rows (only rendered when run as a notebook cell)

sns.set(context="notebook", style="ticks", font_scale=1.5)

# x / y are passed by keyword and `height` replaces the former `size` argument:
# current seaborn releases accept neither positional x/y nor `size`.
sns.lmplot(x='test1', y='test2', hue='accepted', data=df,
           height=6,
           fit_reg=False,
           scatter_kws={"s": 50})

plt.title('Regularized Logistic Regression')
plt.show()

feature mapping（特征映射）

def feature_mapping(x, y, power, as_ndarray=False):
    """Return the polynomial features of ``x`` and ``y`` up to degree ``power``.

    For every total degree i in 0..power and every p in 0..i, the column
    named ``"f{i-p}{p}"`` holds ``x**(i-p) * y**p``; e.g. ``f00`` is all ones,
    ``f10`` is ``x`` and ``f01`` is ``y``.

    Args:
        x, y: array-likes of the same length.
        power: highest total degree to generate.
        as_ndarray: when true return a ``numpy.ndarray``, otherwise a
            ``pandas.DataFrame``.

    Note:
        The two exponents are concatenated without a separator, so names are
        no longer unique once ``power >= 11`` (x**11 and x * y**10 both map
        to ``"f110"``), and the later term silently replaces the earlier one.
    """
    data = {"f{}{}".format(i - p, p): np.power(x, i - p) * np.power(y, p)
            for i in np.arange(power + 1)
            for p in np.arange(i + 1)}

    mapped = pd.DataFrame(data)
    # DataFrame.as_matrix() no longer exists in pandas; to_numpy() is its replacement.
    return mapped.to_numpy() if as_ndarray else mapped

x1 = np.array(df.test1)  # 'test1' column as an array
x2 = np.array(df.test2)  # 'test2' column as an array

# NOTE: rebinds the module-level name `data` to the mapped features (terms up to degree 6).
data = feature_mapping(x1, x2, power=6)
data.head()  # first five rows (only rendered when run as a notebook cell)

regularized cost（正则化代价函数）

theta = np.zeros(data.shape[1])  # one zero-initialised parameter per mapped feature column
X = feature_mapping(x1, x2, power=6, as_ndarray=True)  # same mapped features, as an ndarray
y = get_y(df)  # labels: the last column of df

def regularized_cost(theta, X, y, l=1):
    """Cost plus the L2 penalty (l / (2 * m)) * sum(theta[1:] ** 2), m = len(X).

    ``theta[0]`` is left out of the penalty.
    """
    penalty = (l / (2 * len(X))) * np.power(theta[1:], 2).sum()
    return cost(theta, X, y) + penalty

# With theta all zeros the penalty term is 0, so this equals the plain cost.
regularized_cost(theta, X, y, l=1)

0.6931471805599454

因为我们设置theta为0，所以这个正则化代价函数与代价函数的值应该相同

regularized gradient(正则化梯度)

def regularized_gradient(theta, X, y, l=1):
    """Gradient plus the penalty gradient (l / m) * theta[j] for j >= 1, m = len(X).

    The first component gets a 0 penalty because ``theta[0]`` is not regularized.
    """
    penalty = np.concatenate([np.array([0]), (l / len(X)) * theta[1:]])
    return gradient(theta, X, y) + penalty

# Evaluate the regularized gradient at the initial theta (default l=1).
regularized_gradient(theta, X, y)

拟合参数

import scipy.optimize as opt
print('init cost = {}'.format(regularized_cost(theta, X, y)))  # cost at the starting theta

# Minimize the regularized cost (default l=1) with Newton-CG; the regularized gradient is the Jacobian.
res = opt.minimize(fun=regularized_cost, x0=theta, args=(X, y), method='Newton-CG', jac=regularized_gradient)
res  # result object (only rendered when run as a notebook cell)

预测

final_theta = res.x  # parameter vector found by the optimizer
y_pred = predict(X, final_theta)  # 0/1 predictions for the training rows

print(classification_report(y, y_pred))  # compare the predictions with the labels

使用不同的 λ 画出决策边界

def draw_boundary(power, l):
    """Fit a regularized model on ex2data2.txt and plot its decision boundary.

    Args:
        power: polynomial power for the mapped features.
        l: lambda constant of the regularization term.
    """
    density = 1000            # grid points per axis used to search for the boundary
    threshhold = 2 * 10**-3   # grid points with |X @ theta| below this count as boundary points

    final_theta = feature_mapped_logistic_regression(power, l)
    x, y = find_decision_boundary(density, power, final_theta, threshhold)

    df = pd.read_csv('ex2data2.txt', names=['test1', 'test2', 'accepted'])
    # x / y are passed by keyword and `height` replaces the former `size`
    # argument: current seaborn releases accept neither positional x/y nor `size`.
    sns.lmplot(x='test1', y='test2', hue='accepted', data=df, height=6, fit_reg=False, scatter_kws={"s": 100})

    # Lower-case 'r': current matplotlib rejects upper-case single-letter colour codes.
    plt.scatter(x, y, c='r', s=10)
    plt.title('Decision boundary')
    plt.show()

def feature_mapped_logistic_regression(power, l):
    """Fit regularized logistic regression on ex2data2.txt and return theta.

    For drawing purposes only; not a well-generalized logistic regression.

    Args:
        power: int, polynomial power that test1 / test2 are raised to.
        l: lambda constant for the regularization term.

    Returns:
        The ``x`` attribute of the ``scipy.optimize.minimize`` result.
    """
    samples = pd.read_csv('ex2data2.txt', names=['test1', 'test2', 'accepted'])
    labels = get_y(samples)
    design = feature_mapping(np.array(samples.test1),
                             np.array(samples.test2),
                             power,
                             as_ndarray=True)
    start = np.zeros(design.shape[1])  # start from all-zero parameters

    outcome = opt.minimize(fun=regularized_cost,
                           x0=start,
                           args=(design, labels, l),
                           method='TNC',
                           jac=regularized_gradient)
    return outcome.x

def find_decision_boundary(density, power, theta, threshhold):
    """Find the grid points that lie (approximately) on the decision boundary.

    A ``density`` x ``density`` grid over [-1, 1.5] x [-1, 1.5] is mapped with
    ``feature_mapping``; the points whose ``|features @ theta|`` is below
    ``threshhold`` are kept.

    Returns:
        The ``f10`` and ``f01`` columns of the kept rows, i.e. their x and y
        coordinates.
    """
    t1 = np.linspace(-1, 1.5, density)  # `density` evenly spaced values per axis
    t2 = np.linspace(-1, 1.5, density)

    cordinates = [(x, y) for x in t1 for y in t2]
    x_cord, y_cord = zip(*cordinates)
    mapped_cord = feature_mapping(x_cord, y_cord, power)  # this is a dataframe

    # DataFrame.as_matrix() no longer exists in pandas; to_numpy() is its replacement.
    inner_product = mapped_cord.to_numpy() @ theta

    decision = mapped_cord[np.abs(inner_product) < threshhold]

    return decision.f10, decision.f01

改变 λ 的值，查看效果

draw_boundary(power=6, l=1)     # lambda = 1

draw_boundary(power=6,l=0)  # lambda = 0: the regularization penalty is switched off

draw_boundary(power=6, l=100)  # lambda = 100: a much larger penalty weight

爱吃骨头的猫、

关注

5
点赞
踩
21

收藏

觉得还不错? 一键收藏
打赏
2
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫

专栏目录