Logistic Regression
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')  # nicer plot style
from sklearn.metrics import classification_report  # evaluation report
Prepare the data
data = pd.read_csv('ex2data1.txt', names=['exam1', 'exam2', 'admitted'])
data.head()  # look at the first five rows
| | exam1 | exam2 | admitted |
|---|---|---|---|
| 0 | 34.623660 | 78.024693 | 0 |
| 1 | 30.286711 | 43.894998 | 0 |
| 2 | 35.847409 | 72.902198 | 0 |
| 3 | 60.182599 | 86.308552 | 1 |
| 4 | 79.032736 | 75.344376 | 1 |
data.describe()
| | exam1 | exam2 | admitted |
|---|---|---|---|
| count | 100.000000 | 100.000000 | 100.000000 |
| mean | 65.644274 | 66.221998 | 0.600000 |
| std | 19.458222 | 18.582783 | 0.492366 |
| min | 30.058822 | 30.603263 | 0.000000 |
| 25% | 50.919511 | 48.179205 | 0.000000 |
| 50% | 67.032988 | 67.682381 | 1.000000 |
| 75% | 80.212529 | 79.360605 | 1.000000 |
| max | 99.827858 | 98.869436 | 1.000000 |
sns.set(context="notebook", style="darkgrid", palette=sns.color_palette("RdBu", 2))
sns.lmplot(x='exam1', y='exam2', hue='admitted', data=data,
height=6,
fit_reg=False,
scatter_kws={"s": 50}
)
plt.show()  # take a look at the data
def get_X(df):  # read the features
    """
    use concat to add an intercept column without mutating the input;
    not efficient for big datasets, though
    """
    ones = pd.DataFrame({'ones': np.ones(len(df))})  # ones is an m-row, 1-column dataframe
    data = pd.concat([ones, df], axis=1)  # merge column-wise
    return data.iloc[:, :-1].values  # this returns an ndarray, not a matrix

def get_y(df):  # read the label
    '''assume the last column is the target'''
    return np.array(df.iloc[:, -1])  # df.iloc[:, -1] is the last column of df

def normalize_feature(df):
    """Applies function along input axis (default 0) of DataFrame."""
    return df.apply(lambda column: (column - column.mean()) / column.std())  # feature scaling
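normalize_feature is not actually needed for this part of the exercise, but as a quick illustration (my own check, not from the original assignment), z-score scaling should leave each column with mean ≈ 0 and std = 1:

scaled = normalize_feature(data[['exam1', 'exam2']])  # z-score each column
print(scaled.mean().round(6))  # expect values ≈ 0
print(scaled.std())            # expect exactly 1.0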
X = get_X(data)
print(X.shape)
y = get_y(data)
print(y.shape)
(100, 3)
(100,)
The sigmoid function
g denotes a commonly used logistic function, the S-shaped sigmoid function, defined as:
$$g(z)=\frac{1}{1+e^{-z}}$$
Putting it together, we obtain the hypothesis of the logistic regression model:
$$h_\theta(x)=\frac{1}{1+e^{-\theta^T x}}$$
def sigmoid(z):
# your code here (appro ~ 1 lines)
gz = 1 / (1 + np.exp(-z))
return gz
The code below calls the function you wrote above and plots the sigmoid curve. If your implementation is correct, you should see the plot appear below.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(np.arange(-10, 10, step=0.01),
sigmoid(np.arange(-10, 10, step=0.01)))
ax.set_ylim((-0.1,1.1))
ax.set_xlabel('z', fontsize=18)
ax.set_ylabel('g(z)', fontsize=18)
ax.set_title('sigmoid function', fontsize=18)
plt.show()
Cost function
- $\max(\ell(\theta)) = \min(-\ell(\theta))$
- choose $-\ell(\theta)$ as the cost function

$$J(\theta)=-\frac{1}{m}\sum_{i=1}^{m}\left[y^{(i)}\log\left(h_\theta(x^{(i)})\right)+\left(1-y^{(i)}\right)\log\left(1-h_\theta(x^{(i)})\right)\right]=\frac{1}{m}\sum_{i=1}^{m}\left[-y^{(i)}\log\left(h_\theta(x^{(i)})\right)-\left(1-y^{(i)}\right)\log\left(1-h_\theta(x^{(i)})\right)\right]$$
theta = np.zeros(3)  # X is (m, n), so theta is an n-vector
theta
array([0., 0., 0.])
def cost(theta, X, y):
''' cost fn is -l(theta) for you to minimize'''
# your code here (appro ~ 2 lines)
    h = sigmoid(X @ theta)  # equivalent to sigmoid(np.dot(theta.T, X.T))
m = len(y)
costf = (np.dot(-y, np.log(h)) - np.dot(1 - y, np.log(1 - h))) / m
return costf
# Hint: X @ theta is equivalent to X.dot(theta)
cost(theta, X, y)
0.6931471805599453
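A quick sanity check (not part of the original exercise): with $\theta = 0$, $h_\theta(x) = 0.5$ for every example, so the initial cost should be $-\log(0.5) = \log 2$:

print(np.log(2))  # 0.6931471805599453, matching cost(theta, X, y) above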
Gradient descent
- this is batch gradient descent
- vectorized form:
$$\frac{1}{m}X^T\left(\mathrm{sigmoid}(X\theta)-y\right)$$

$$\frac{\partial J(\theta)}{\partial\theta_j}=\frac{1}{m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)})-y^{(i)}\right)x_j^{(i)}$$
def gradient(theta, X, y):
# your code here (appro ~ 2 lines)
h = sigmoid(np.dot(X, theta))
m = len(y)
grad = np.dot(X.T, h - y) / m
return grad
gradient(theta, X, y)
array([ -0.1 , -12.00921659, -11.26284221])
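As an optional check (my addition, using only numpy), the analytic gradient can be verified against a centered finite-difference approximation of the cost:

def numeric_gradient(f, theta, eps=1e-6):
    # centered finite differences, one coordinate at a time
    grad = np.zeros_like(theta)
    for j in range(len(theta)):
        e = np.zeros_like(theta)
        e[j] = eps
        grad[j] = (f(theta + e) - f(theta - e)) / (2 * eps)
    return grad

print(numeric_gradient(lambda t: cost(t, X, y), theta))  # should closely match gradient(theta, X, y)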
Fit the parameters
- Here I use scipy.optimize.minimize to find the parameters.
import scipy.optimize as opt
res = opt.minimize(fun=cost, x0=theta, args=(X, y), method='Newton-CG', jac=gradient)
print(res)
fun: 0.20355713617495905
jac: array([ 0.00014506, -0.00324422, -0.00261146])
message: 'Optimization terminated successfully.'
nfev: 63
nhev: 0
nit: 25
njev: 160
status: 0
success: True
x: array([-24.53591642, 0.20123168, 0.19640851])
Predict and validate on the training set
def predict(x, theta):
    # your code here (appro ~ 2 lines)
    y_pred = sigmoid(x @ theta)  # use the argument x, not the global X
    return (y_pred >= 0.5).astype(int)
final_theta = res.x
y_pred = predict(X, final_theta)
print(classification_report(y, y_pred))
precision recall f1-score support
0 0.87 0.85 0.86 40
1 0.90 0.92 0.91 60
accuracy 0.89 100
macro avg 0.89 0.88 0.88 100
weighted avg 0.89 0.89 0.89 100
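The overall accuracy can also be read directly from the predictions, and should agree with the report above:

print('accuracy = {}'.format((y_pred == y).mean()))  # 0.89 on the training set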
Find the decision boundary
http://stats.stackexchange.com/questions/93569/why-is-logistic-regression-a-linear-classifier
$X\theta = 0$ (this is the line)
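With three parameters, the boundary $\theta_0 + \theta_1 x_1 + \theta_2 x_2 = 0$ can be solved for $x_2$ directly, which is exactly what the `coef` computation below implements:

$$x_2 = -\frac{\theta_0 + \theta_1 x_1}{\theta_2}$$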
print(res.x) # this is final theta
[-24.53591642 0.20123168 0.19640851]
coef = -(res.x / res.x[2])  # rescale so the x2 coefficient is -1
print(coef)
[124.92287633  -1.02455683  -1.        ]
x = np.arange(130, step=0.1)
y = coef[0] + coef[1]*x
data.describe() # find the range of x and y
| | exam1 | exam2 | admitted |
|---|---|---|---|
| count | 100.000000 | 100.000000 | 100.000000 |
| mean | 65.644274 | 66.221998 | 0.600000 |
| std | 19.458222 | 18.582783 | 0.492366 |
| min | 30.058822 | 30.603263 | 0.000000 |
| 25% | 50.919511 | 48.179205 | 0.000000 |
| 50% | 67.032988 | 67.682381 | 1.000000 |
| 75% | 80.212529 | 79.360605 | 1.000000 |
| max | 99.827858 | 98.869436 | 1.000000 |
sns.set(context="notebook", style="ticks", font_scale=1.5)
sns.lmplot(x='exam1', y='exam2', hue='admitted', data=data,
height=6,
fit_reg=False,
scatter_kws={"s": 25}
)
plt.plot(x, y, 'grey')
plt.xlim(0, 130)
plt.ylim(0, 130)
plt.title('Decision Boundary')
plt.show()
3 - Regularized Logistic Regression
df = pd.read_csv('ex2data2.txt', names=['test1', 'test2', 'accepted'])
df.head()
| | test1 | test2 | accepted |
|---|---|---|---|
| 0 | 0.051267 | 0.69956 | 1 |
| 1 | -0.092742 | 0.68494 | 1 |
| 2 | -0.213710 | 0.69225 | 1 |
| 3 | -0.375000 | 0.50219 | 1 |
| 4 | -0.513250 | 0.46564 | 1 |
sns.set(context="notebook", style="ticks", font_scale=1.5)
sns.lmplot(x='test1', y='test2', hue='accepted', data=df,
height=6,
fit_reg=False,
scatter_kws={"s": 50}
)
plt.title('Regularized Logistic Regression')
plt.show()
Feature mapping
polynomial expansion

for i in 0..power:
    for p in 0..i:
        output x^(i-p) * y^p
def feature_mapping(x, y, power, as_ndarray=False):
    """return mapped features as ndarray or dataframe"""
    data = {"f{}{}".format(i - p, p): np.power(x, i - p) * np.power(y, p)
            for i in np.arange(power + 1)
            for p in np.arange(i + 1)
            }

    if as_ndarray:
        return pd.DataFrame(data).values
    else:
        return pd.DataFrame(data)
x1 = np.array(df.test1)
x2 = np.array(df.test2)
data = feature_mapping(x1, x2, power=6)
print(data.shape)
data.head()
(118, 28)
| | f00 | f10 | f01 | f20 | f11 | f02 | f30 | f21 | f12 | f03 | ... | f23 | f14 | f05 | f60 | f51 | f42 | f33 | f24 | f15 | f06 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.0 | 0.051267 | 0.69956 | 0.002628 | 0.035864 | 0.489384 | 0.000135 | 0.001839 | 0.025089 | 0.342354 | ... | 0.000900 | 0.012278 | 0.167542 | 1.815630e-08 | 2.477505e-07 | 0.000003 | 0.000046 | 0.000629 | 0.008589 | 0.117206 |
1 | 1.0 | -0.092742 | 0.68494 | 0.008601 | -0.063523 | 0.469143 | -0.000798 | 0.005891 | -0.043509 | 0.321335 | ... | 0.002764 | -0.020412 | 0.150752 | 6.362953e-07 | -4.699318e-06 | 0.000035 | -0.000256 | 0.001893 | -0.013981 | 0.103256 |
2 | 1.0 | -0.213710 | 0.69225 | 0.045672 | -0.147941 | 0.479210 | -0.009761 | 0.031616 | -0.102412 | 0.331733 | ... | 0.015151 | -0.049077 | 0.158970 | 9.526844e-05 | -3.085938e-04 | 0.001000 | -0.003238 | 0.010488 | -0.033973 | 0.110047 |
3 | 1.0 | -0.375000 | 0.50219 | 0.140625 | -0.188321 | 0.252195 | -0.052734 | 0.070620 | -0.094573 | 0.126650 | ... | 0.017810 | -0.023851 | 0.031940 | 2.780914e-03 | -3.724126e-03 | 0.004987 | -0.006679 | 0.008944 | -0.011978 | 0.016040 |
4 | 1.0 | -0.513250 | 0.46564 | 0.263426 | -0.238990 | 0.216821 | -0.135203 | 0.122661 | -0.111283 | 0.100960 | ... | 0.026596 | -0.024128 | 0.021890 | 1.827990e-02 | -1.658422e-02 | 0.015046 | -0.013650 | 0.012384 | -0.011235 | 0.010193 |
5 rows × 28 columns
data.describe()
| | f00 | f10 | f01 | f20 | f11 | f02 | f30 | f21 | f12 | f03 | ... | f23 | f14 | f05 | f60 | f51 | f42 | f33 | f24 | f15 | f06 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 118.0 | 118.000000 | 118.000000 | 118.000000 | 118.000000 | 118.000000 | 1.180000e+02 | 118.000000 | 118.000000 | 118.000000 | ... | 118.000000 | 1.180000e+02 | 118.000000 | 1.180000e+02 | 118.000000 | 1.180000e+02 | 118.000000 | 1.180000e+02 | 118.000000 | 1.180000e+02 |
mean | 1.0 | 0.054779 | 0.183102 | 0.247575 | -0.025472 | 0.301370 | 5.983333e-02 | 0.030682 | 0.015483 | 0.142350 | ... | 0.018278 | 4.089084e-03 | 0.115710 | 7.837118e-02 | -0.000703 | 1.893340e-02 | -0.001705 | 2.259170e-02 | -0.006302 | 1.257256e-01 |
std | 0.0 | 0.496654 | 0.519743 | 0.248532 | 0.224075 | 0.284536 | 2.746459e-01 | 0.134706 | 0.150143 | 0.326134 | ... | 0.058513 | 9.993907e-02 | 0.299092 | 1.938621e-01 | 0.058271 | 3.430092e-02 | 0.037443 | 4.346935e-02 | 0.090621 | 2.964416e-01 |
min | 1.0 | -0.830070 | -0.769740 | 0.000040 | -0.484096 | 0.000026 | -5.719317e-01 | -0.358121 | -0.483743 | -0.456071 | ... | -0.142660 | -4.830370e-01 | -0.270222 | 6.472253e-14 | -0.203971 | 2.577297e-10 | -0.113448 | 2.418097e-10 | -0.482684 | 1.795116e-14 |
25% | 1.0 | -0.372120 | -0.254385 | 0.043243 | -0.178209 | 0.061086 | -5.155632e-02 | -0.023672 | -0.042980 | -0.016492 | ... | -0.001400 | -7.449462e-03 | -0.001072 | 8.086369e-05 | -0.006381 | 1.258285e-04 | -0.005749 | 3.528590e-04 | -0.016662 | 2.298277e-04 |
50% | 1.0 | -0.006336 | 0.213455 | 0.165397 | -0.016521 | 0.252195 | -2.544062e-07 | 0.006603 | -0.000039 | 0.009734 | ... | 0.001026 | -8.972096e-09 | 0.000444 | 4.527344e-03 | -0.000004 | 3.387050e-03 | -0.000005 | 3.921378e-03 | -0.000020 | 1.604015e-02 |
75% | 1.0 | 0.478970 | 0.646563 | 0.389925 | 0.100795 | 0.464189 | 1.099616e-01 | 0.086392 | 0.079510 | 0.270310 | ... | 0.021148 | 2.751341e-02 | 0.113020 | 5.932959e-02 | 0.002104 | 2.090875e-02 | 0.001024 | 2.103622e-02 | 0.001289 | 1.001215e-01 |
max | 1.0 | 1.070900 | 1.108900 | 1.146827 | 0.568307 | 1.229659 | 1.228137e+00 | 0.449251 | 0.505577 | 1.363569 | ... | 0.287323 | 4.012965e-01 | 1.676725 | 1.508320e+00 | 0.250577 | 2.018260e-01 | 0.183548 | 2.556084e-01 | 0.436209 | 1.859321e+00 |
8 rows × 28 columns
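For comparison (my addition; the original uses only the hand-rolled mapping above), scikit-learn's `PolynomialFeatures` produces an equivalent degree-6 expansion, although its 28 columns come out in a different order:

from sklearn.preprocessing import PolynomialFeatures  # assumes scikit-learn is available

poly = PolynomialFeatures(degree=6, include_bias=True)
X_poly = poly.fit_transform(np.column_stack([x1, x2]))
print(X_poly.shape)  # (118, 28) — same feature count as feature_mapping(x1, x2, power=6)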
Regularized cost

$$J(\theta)=\frac{1}{m}\sum_{i=1}^{m}\left[-y^{(i)}\log\left(h_\theta(x^{(i)})\right)-\left(1-y^{(i)}\right)\log\left(1-h_\theta(x^{(i)})\right)\right]+\frac{\lambda}{2m}\sum_{j=1}^{n}\theta_j^2$$
theta = np.zeros(data.shape[1])
X = feature_mapping(x1, x2, power=6, as_ndarray=True)
print(X.shape)
y = get_y(df)
print(y.shape)
(118, 28)
(118,)
def regularized_cost(theta, X, y, l=1):
    # your code here (appro ~ 3 lines)
    m = len(y)
    # note: theta_0 is not regularized, so the penalty sums over theta[1:]
    regu_cost = cost(theta, X, y) + l / (2 * m) * np.sum(np.power(theta[1:], 2))
    return regu_cost
regularized_cost(theta, X, y, l=1)
0.6931471805599454
Because theta is initialized to all zeros, the regularized cost should equal the plain cost here.
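This can be asserted directly:

assert regularized_cost(theta, X, y) == cost(theta, X, y)  # the penalty is 0 when theta is all zeros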
Regularized gradient

$$\frac{\partial J(\theta)}{\partial\theta_j}=\left(\frac{1}{m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)})-y^{(i)}\right)x_j^{(i)}\right)+\frac{\lambda}{m}\theta_j \quad \text{for } j\ge 1$$
def regularized_gradient(theta, X, y, l=1):
    # your code here (appro ~ 3 lines)
    m = len(y)
    regularized_term = l / m * theta
    regularized_term[0] = 0  # theta_0 is not regularized
    return gradient(theta, X, y) + regularized_term
regularized_gradient(theta, X, y)
array([8.47457627e-03, 1.87880932e-02, 7.77711864e-05, 5.03446395e-02,
1.15013308e-02, 3.76648474e-02, 1.83559872e-02, 7.32393391e-03,
8.19244468e-03, 2.34764889e-02, 3.93486234e-02, 2.23923907e-03,
1.28600503e-02, 3.09593720e-03, 3.93028171e-02, 1.99707467e-02,
4.32983232e-03, 3.38643902e-03, 5.83822078e-03, 4.47629067e-03,
3.10079849e-02, 3.10312442e-02, 1.09740238e-03, 6.31570797e-03,
4.08503006e-04, 7.26504316e-03, 1.37646175e-03, 3.87936363e-02])
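Likewise, at $\theta = 0$ the penalty term of the gradient vanishes, so the regularized and plain gradients should agree:

print(np.allclose(regularized_gradient(theta, X, y), gradient(theta, X, y)))  # True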
Fit the parameters
import scipy.optimize as opt
print('init cost = {}'.format(regularized_cost(theta, X, y)))
res = opt.minimize(fun=regularized_cost, x0=theta, args=(X, y), method='Newton-CG', jac=regularized_gradient)
res
init cost = 0.6931471805599454
fun: 0.5351602503809838
jac: array([-4.40953967e-08, 4.80421622e-09, -2.93966980e-09, -8.45363795e-09,
1.70329722e-08, 3.38119611e-08, -2.76831472e-09, -8.60782540e-09,
1.46305718e-09, 7.94770647e-09, -1.06087740e-08, 7.86887349e-10,
-6.42298185e-09, 7.96794764e-09, 2.02837825e-08, -5.31013168e-10,
-6.81811705e-09, 1.97820237e-09, -5.09429939e-09, -3.89325855e-11,
6.80254916e-09, -6.95320736e-09, -7.24708972e-10, -1.04093445e-09,
1.35729125e-09, -2.03967601e-09, 1.65029609e-09, 9.68433032e-09])
message: 'Optimization terminated successfully.'
nfev: 7
nhev: 0
nit: 6
njev: 53
status: 0
success: True
x: array([ 1.14213418, 0.60132066, 1.16718272, -1.87174547, -0.91573325,
-1.26952692, 0.12668314, -0.36873143, -0.34518464, -0.17377595,
-1.42386031, -0.04855655, -0.6064203 , -0.26931816, -1.16315916,
-0.24310139, -0.20707435, -0.04318408, -0.28028167, -0.28695652,
-0.46910528, -1.03618073, 0.02923477, -0.29262232, 0.0173672 ,
-0.32897267, -0.13795895, -0.93199012])
Prediction
final_theta = res.x
y_pred = predict(X, final_theta)
print(classification_report(y, y_pred))
precision recall f1-score support
0 0.88 0.75 0.81 60
1 0.78 0.90 0.83 58
accuracy 0.82 118
macro avg 0.83 0.82 0.82 118
weighted avg 0.83 0.82 0.82 118
Using different values of $\lambda$ (a constant)
Draw the decision boundary
- we find all x satisfying $X\theta = 0$
- instead of solving the polynomial equation, just create a coordinate grid of (x, y) that is dense enough, find all points where $X\theta$ is close enough to 0, and plot them
def draw_boundary(power, l):
    """
    power: polynomial power for mapped features
    l: lambda constant
    """
    density = 1000
    threshold = 2 * 10**-3

    final_theta = feature_mapped_logistic_regression(power, l)
    x, y = find_decision_boundary(density, power, final_theta, threshold)

    df = pd.read_csv('ex2data2.txt', names=['test1', 'test2', 'accepted'])
    sns.lmplot(x='test1', y='test2', hue='accepted', data=df, height=6, fit_reg=False, scatter_kws={"s": 100})

    plt.scatter(x, y, c='r', s=10)
    plt.title('Decision boundary')
    plt.show()
def feature_mapped_logistic_regression(power, l):
    """for drawing purposes only; not a well-generalized logistic regression
    power: int
        raise x1, x2 to this polynomial power
    l: int
        lambda constant for the regularization term
    """
    df = pd.read_csv('ex2data2.txt', names=['test1', 'test2', 'accepted'])
    x1 = np.array(df.test1)
    x2 = np.array(df.test2)
    y = get_y(df)

    X = feature_mapping(x1, x2, power, as_ndarray=True)
    theta = np.zeros(X.shape[1])

    res = opt.minimize(fun=regularized_cost,
                       x0=theta,
                       args=(X, y, l),
                       method='TNC',
                       jac=regularized_gradient)
    final_theta = res.x

    return final_theta
def find_decision_boundary(density, power, theta, threshold):
    """locate points on the decision boundary by dense grid search"""
    t1 = np.linspace(-1, 1.5, density)
    t2 = np.linspace(-1, 1.5, density)

    coordinates = [(x, y) for x in t1 for y in t2]
    x_cord, y_cord = zip(*coordinates)
    mapped_cord = feature_mapping(x_cord, y_cord, power)  # this is a dataframe

    inner_product = mapped_cord.values @ theta
    decision = mapped_cord[np.abs(inner_product) < threshold]

    return decision.f10, decision.f01
Change the value of $\lambda$ and observe the effect (optional)
draw_boundary(power=6, l=1)     # lambda = 1
draw_boundary(power=6, l=0.01)  # lambda = 0.01: little regularization, the boundary overfits
draw_boundary(power=6, l=100)   # lambda = 100: heavy regularization, the boundary underfits