Logistic Regression
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')  # nicer plot style
from sklearn.metrics import classification_report  # evaluation report
Prepare the data
data = pd.read_csv('ex2data1.txt', names=['exam1', 'exam2', 'admitted'])
data.head()  # look at the first five rows
| | exam1 | exam2 | admitted |
|---|---|---|---|
| 0 | 34.623660 | 78.024693 | 0 |
| 1 | 30.286711 | 43.894998 | 0 |
| 2 | 35.847409 | 72.902198 | 0 |
| 3 | 60.182599 | 86.308552 | 1 |
| 4 | 79.032736 | 75.344376 | 1 |
data.describe()
| | exam1 | exam2 | admitted |
|---|---|---|---|
| count | 100.000000 | 100.000000 | 100.000000 |
| mean | 65.644274 | 66.221998 | 0.600000 |
| std | 19.458222 | 18.582783 | 0.492366 |
| min | 30.058822 | 30.603263 | 0.000000 |
| 25% | 50.919511 | 48.179205 | 0.000000 |
| 50% | 67.032988 | 67.682381 | 1.000000 |
| 75% | 80.212529 | 79.360605 | 1.000000 |
| max | 99.827858 | 98.869436 | 1.000000 |
sns.set(context="notebook", style="darkgrid", palette=sns.color_palette("RdBu", 2))
sns.lmplot(x='exam1', y='exam2', hue='admitted', data=data,
height=6,
fit_reg=False,
scatter_kws={"s": 50}
)
plt.show()  # take a look at the data
def get_X(df):  # read the features
    """
    use concat to add an intercept column without mutating the input;
    not efficient for big datasets, though
    """
    ones = pd.DataFrame({'ones': np.ones(len(df))})  # ones is an m-row, 1-column dataframe
    data = pd.concat([ones, df], axis=1)  # merge column-wise
    return data.iloc[:, :-1].values  # this returns an ndarray, not a matrix

def get_y(df):  # read the label
    '''assume the last column is the target'''
    return np.array(df.iloc[:, -1])  # df.iloc[:, -1] is the last column of df

def normalize_feature(df):
    """Applies function along input axis (default 0) of DataFrame."""
    return df.apply(lambda column: (column - column.mean()) / column.std())  # feature scaling
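normalize_feature is not actually needed for this part of the exercise, but as a quick illustration (my own check, not from the original assignment), z-score scaling should leave each column with mean ≈ 0 and std = 1:

scaled = normalize_feature(data[['exam1', 'exam2']])  # z-score each column
print(scaled.mean().round(6))  # expect values ≈ 0
print(scaled.std())            # expect exactly 1.0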
X = get_X(data)
print(X.shape)
y = get_y(data)
print(y.shape)
(100, 3)
(100,)
The sigmoid function
g denotes a commonly used logistic function, the S-shaped sigmoid function, defined as:
$$g(z)=\frac{1}{1+e^{-z}}$$
Putting it together, we obtain the hypothesis of the logistic regression model:
$$h_\theta(x)=\frac{1}{1+e^{-\theta^T x}}$$
def sigmoid(z):
# your code here (appro ~ 1 lines)
gz = 1 / (1 + np.exp(-z))
return gz
The code below calls the function you wrote above and plots the sigmoid curve. If your implementation is correct, you should see the plot appear below.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(np.arange(-10, 10, step=0.01),
sigmoid(np.arange(-10, 10, step=0.01)))
ax.set_ylim((-0.1,1.1))
ax.set_xlabel('z', fontsize=18)
ax.set_ylabel('g(z)', fontsize=18)
ax.set_title('sigmoid function', fontsize=18)
plt.show()
Cost function
- $\max(\ell(\theta)) = \min(-\ell(\theta))$
- choose $-\ell(\theta)$ as the cost function

$$J(\theta)=-\frac{1}{m}\sum_{i=1}^{m}\left[y^{(i)}\log\left(h_\theta(x^{(i)})\right)+\left(1-y^{(i)}\right)\log\left(1-h_\theta(x^{(i)})\right)\right]=\frac{1}{m}\sum_{i=1}^{m}\left[-y^{(i)}\log\left(h_\theta(x^{(i)})\right)-\left(1-y^{(i)}\right)\log\left(1-h_\theta(x^{(i)})\right)\right]$$
theta = np.zeros(3)  # X is (m, n), so theta is an n-vector
theta
array([0., 0., 0.])
def cost(theta, X, y):
''' cost fn is -l(theta) for you to minimize'''
# your code here (appro ~ 2 lines)
    h = sigmoid(X @ theta)  # equivalent to sigmoid(np.dot(theta.T, X.T))
m = len(y)
costf = (np.dot(-y, np.log(h)) - np.dot(1 - y, np.log(1 - h))) / m
return costf
# Hint: X @ theta is equivalent to X.dot(theta)
cost(theta, X, y)
0.6931471805599453
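A quick sanity check (not part of the original exercise): with $\theta = 0$, $h_\theta(x) = 0.5$ for every example, so the initial cost should be $-\log(0.5) = \log 2$:

print(np.log(2))  # 0.6931471805599453, matching cost(theta, X, y) above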
Gradient descent
- this is batch gradient descent
- vectorized form:
$$\frac{1}{m}X^T\left(\mathrm{sigmoid}(X\theta)-y\right)$$

$$\frac{\partial J(\theta)}{\partial\theta_j}=\frac{1}{m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)})-y^{(i)}\right)x_j^{(i)}$$
def gradient(theta, X, y):
# your code here (appro ~ 2 lines)
h = sigmoid(np.dot(X, theta))
m = len(y)
grad = np.dot(X.T, h - y) / m
return grad
gradient(theta, X, y)
array([ -0.1 , -12.00921659, -11.26284221])
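As an optional check (my addition, using only numpy), the analytic gradient can be verified against a centered finite-difference approximation of the cost:

def numeric_gradient(f, theta, eps=1e-6):
    # centered finite differences, one coordinate at a time
    grad = np.zeros_like(theta)
    for j in range(len(theta)):
        e = np.zeros_like(theta)
        e[j] = eps
        grad[j] = (f(theta + e) - f(theta - e)) / (2 * eps)
    return grad

print(numeric_gradient(lambda t: cost(t, X, y), theta))  # should closely match gradient(theta, X, y)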
Fit the parameters
- Here I use scipy.optimize.minimize to find the parameters.
import scipy.optimize as opt
res = opt.minimize(fun=cost, x0=theta, args=(X, y), method='Newton-CG', jac=gradient)
print(res)
fun: 0.20355713617495905
jac: array([ 0.00014506, -0.00324422, -0.00261146])
message: 'Optimization terminated successfully.'
nfev: 63
nhev: 0
nit: 25
njev: 160
status: 0
success: True
x: array([-24.53591642, 0.20123168, 0.19640851])
Predict and validate on the training set
def predict(x, theta):
    # your code here (appro ~ 2 lines)
    y_pred = sigmoid(x @ theta)  # use the argument x, not the global X
    return (y_pred >= 0.5).astype(int)
final_theta = res.x
y_pred = predict(X, final_theta)
print(classification_report(y, y_pred))
precision recall f1-score support
0 0.87 0.85 0.86 40
1 0.90 0.92 0.91 60
accuracy 0.89 100
macro avg 0.89 0.88 0.88 100
weighted avg 0.89 0.89 0.89 100
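The overall accuracy can also be read directly from the predictions, and should agree with the report above:

print('accuracy = {}'.format((y_pred == y).mean()))  # 0.89 on the training set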
Find the decision boundary
http://stats.stackexchange.com/questions/93569/why-is-logistic-regression-a-linear-classifier
$X\theta = 0$ (this is the line)
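With three parameters, the boundary $\theta_0 + \theta_1 x_1 + \theta_2 x_2 = 0$ can be solved for $x_2$ directly, which is exactly what the `coef` computation below implements:

$$x_2 = -\frac{\theta_0 + \theta_1 x_1}{\theta_2}$$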
print(res.x) # this is final theta
[-24.53591642 0.20123168 0.19640851]
coef = -(res.x / res.x[2])  # rescale so the x2 coefficient is -1
print(coef)
[124.92287633  -1.02455683  -1.        ]
x = np.arange(130, step=0.1)
y = coef[0] + coef[1]*x
data.describe() # find the range of x and y
| | exam1 | exam2 | admitted |
|---|---|---|---|
| count | 100.000000 | 100.000000 | 100.000000 |
| mean | 65.644274 | 66.221998 | 0.600000 |
| std | 19.458222 | 18.582783 | 0.492366 |
| min | 30.058822 | 30.603263 | 0.000000 |
| 25% | 50.919511 | 48.179205 | 0.000000 |
| 50% | 67.032988 | 67.682381 | 1.000000 |
| 75% | 80.212529 | 79.360605 | 1.000000 |
| max | 99.827858 | 98.869436 | 1.000000 |
sns.set(context="notebook", style="ticks", font_scale=1.5)
sns.lmplot(x='exam1', y='exam2', hue='admitted', data=data,
height=6,
fit_reg=False,
scatter_kws={"s": 25}
)
plt.plot(x, y, 'grey')
plt.xlim(0, 130)
plt.ylim(0, 130)
plt.title('Decision Boundary')
plt.show()
3 - Regularized Logistic Regression
df = pd.read_csv('ex2data2.txt', names=['test1', 'test2', 'accepted'])
df.head()
| | test1 | test2 | accepted |
|---|---|---|---|
| 0 | 0.051267 | 0.69956 | 1 |
| 1 | -0.092742 | 0.68494 | 1 |
| 2 | -0.213710 | 0.69225 | 1 |
| 3 | -0.375000 | 0.50219 | 1 |
| 4 | -0.513250 | 0.46564 | 1 |
sns.set(context="notebook", style="ticks", font_scale=1.5)
sns.lmplot(x='test1', y='test2', hue='accepted', data=df,
height=6,
fit_reg=False,
scatter_kws={"s": 50}
)
plt.title('Regularized Logistic Regression')
plt.show()
Feature mapping
polynomial expansion

for i in 0..power:
    for p in 0..i:
        output x^(i-p) * y^p
def feature_mapping(x, y, power, as_ndarray=False):
    """return mapped features as ndarray or dataframe"""
    data = {"f{}{}".format(i - p, p): np.power(x, i - p) * np.power(y, p)
            for i in np.arange(power + 1)
            for p in np.arange(i + 1)
            }

    if as_ndarray:
        return pd.DataFrame(data).values
    else:
        return pd.DataFrame(data)
x1 = np.array(df.test1)
x2 = np.array(df.test2)
data = feature_mapping(x1, x2, power=6)
print(data.shape)
data.head()
(118, 28)
| | f00 | f10 | f01 | f20 | f11 | f02 | f30 | f21 | f12 | f03 | ... | f23 | f14 | f05 | f60 | f51 | f42 | f33 | f24 | f15 | f06 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.0 | 0.051267 | 0.69956 | 0.002628 | 0.035864 | 0.489384 | 0.000135 | 0.001839 | 0.025089 | 0.342354 | ... | 0.000900 | 0.012278 | 0.167542 | 1.815630e-08 | 2.477505e-07 | 0.000003 | 0.000046 | 0.000629 | 0.008589 | 0.117206 |
1 | 1.0 | -0.092742 | 0.68494 | 0.008601 | -0.063523 | 0.469143 | -0.000798 | 0.005891 | -0.043509 | 0.321335 | ... | 0.002764 | -0.020412 | 0.150752 | 6.362953e-07 | -4.699318e-06 | 0.000035 | -0.000256 | 0.001893 | -0.013981 | 0.103256 |
2 | 1.0 | -0.213710 | 0.69225 | 0.045672 | -0.147941 | 0.479210 | -0.009761 | 0.031616 | -0.102412 | 0.331733 | ... | 0.015151 | -0.049077 | 0.158970 | 9.526844e-05 | -3.085938e-04 | 0.001000 | -0.003238 | 0.010488 | -0.033973 | 0.110047 |
3 | 1.0 | -0.375000 | 0.50219 | 0.140625 | -0.188321 | 0.252195 | -0.052734 | 0.070620 | -0.094573 | 0.126650 | ... | 0.017810 | -0.023851 | 0.031940 | 2.780914e-03 | -3.724126e-03 | 0.004987 | -0.006679 | 0.008944 | -0.011978 | 0.016040 |
4 | 1.0 | -0.513250 | 0.46564 | 0.263426 | -0.238990 | 0.216821 | -0.135203 | 0.122661 | -0.111283 | 0.100960 | ... | 0.026596 | -0.024128 | 0.021890 | 1.827990e-02 | -1.658422e-02 | 0.015046 | -0.013650 | 0.012384 | -0.011235 | 0.010193 |
5 rows × 28 columns
data.describe()
| | f00 | f10 | f01 | f20 | f11 | f02 | f30 | f21 | f12 | f03 | ... | f23 | f14 | f05 | f60 | f51 | f42 | f33 | f24 | f15 | f06 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 118.0 | 118.000000 | 118.000000 | 118.000000 | 118.000000 | 118.000000 | 1.180000e+02 | 118.000000 | 118.000000 | 118.000000 | ... | 118.000000 | 1.180000e+02 | 118.000000 | 1.180000e+02 | 118.000000 | 1.180000e+02 | 118.000000 | 1.180000e+02 | 118.000000 | 1.180000e+02 |
mean | 1.0 | 0.054779 | 0.183102 | 0.247575 | -0.025472 | 0.301370 | 5.983333e-02 | 0.030682 | 0.015483 | 0.142350 | ... | 0.018278 | 4.089084e-03 | 0.115710 | 7.837118e-02 | -0.000703 | 1.893340e-02 | -0.001705 | 2.259170e-02 | -0.006302 | 1.257256e-01 |
std | 0.0 | 0.496654 | 0.519743 | 0.248532 | 0.224075 | 0.284536 | 2.746459e-01 | 0.134706 | 0.150143 | 0.326134 | ... | 0.058513 | 9.993907e-02 | 0.299092 | 1.938621e-01 | 0.058271 | 3.430092e-02 | 0.037443 | 4.346935e-02 | 0.090621 | 2.964416e-01 |
min | 1.0 | -0.830070 | -0.769740 | 0.000040 | -0.484096 | 0.000026 | -5.719317e-01 | -0.358121 | -0.483743 | -0.456071 | ... | -0.142660 | -4.830370e-01 | -0.270222 | 6.472253e-14 | -0.203971 | 2.577297e-10 | -0.113448 | 2.418097e-10 | -0.482684 | 1.795116e-14 |
25% | 1.0 | -0.372120 | -0.254385 | 0.043243 | -0.178209 | 0.061086 | -5.155632e-02 | -0.023672 | -0.042980 | -0.016492 | ... | -0.001400 | -7.449462e-03 | -0.001072 | 8.086369e-05 | -0.006381 | 1.258285e-04 | -0.005749 | 3.528590e-04 | -0.016662 | 2.298277e-04 |
50% | 1.0 | -0.006336 | 0.213455 | 0.165397 | -0.016521 | 0.252195 | -2.544062e-07 | 0.006603 | -0.000039 | 0.009734 | ... | 0.001026 | -8.972096e-09 | 0.000444 | 4.527344e-03 | -0.000004 | 3.387050e-03 | -0.000005 | 3.921378e-03 | -0.000020 | 1.604015e-02 |
75% | 1.0 | 0.478970 | 0.646563 | 0.389925 | 0.100795 | 0.464189 | 1.099616e-01 | 0.086392 | 0.079510 | 0.270310 | ... | 0.021148 | 2.751341e-02 | 0.113020 | 5.932959e-02 | 0.002104 | 2.090875e-02 | 0.001024 | 2.103622e-02 | 0.001289 | 1.001215e-01 |
max | 1.0 | 1.070900 | 1.108900 | 1.146827 | 0.568307 | 1.229659 | 1.228137e+00 | 0.449251 | 0.505577 | 1.363569 | ... | 0.287323 | 4.012965e-01 | 1.676725 | 1.508320e+00 | 0.250577 | 2.018260e-01 | 0.183548 | 2.556084e-01 | 0.436209 | 1.859321e+00 |
8 rows × 28 columns
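For comparison (my addition; the original uses only the hand-rolled mapping above), scikit-learn's `PolynomialFeatures` produces an equivalent degree-6 expansion, although its 28 columns come out in a different order:

from sklearn.preprocessing import PolynomialFeatures  # assumes scikit-learn is available

poly = PolynomialFeatures(degree=6, include_bias=True)
X_poly = poly.fit_transform(np.column_stack([x1, x2]))
print(X_poly.shape)  # (118, 28) — same feature count as feature_mapping(x1, x2, power=6)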
Regularized cost

$$J(\theta)=\frac{1}{m}\sum_{i=1}^{m}\left[-y^{(i)}\log\left(h_\theta(x^{(i)})\right)-\left(1-y^{(i)}\right)\log\left(1-h_\theta(x^{(i)})\right)\right]+\frac{\lambda}{2m}\sum_{j=1}^{n}\theta_j^2$$
theta = np.zeros(data.shape[1])
X = feature_mapping(x1, x2, power=6, as_ndarray=True)
print(X.shape)
y = get_y(df)
print(y.shape)
(118, 28)
(118,)
def regularized_cost(theta, X, y, l=1):
    # your code here (appro ~ 3 lines)
    m = len(y)
    # note: theta_0 is not regularized, so the penalty sums over theta[1:]
    regu_cost = cost(theta, X, y) + l / (2 * m) * np.sum(np.power(theta[1:], 2))
    return regu_cost
regularized_cost(theta, X, y, l=1)
0.6931471805599454
Because theta is initialized to all zeros, the regularized cost should equal the plain cost here.
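This can be asserted directly:

assert regularized_cost(theta, X, y) == cost(theta, X, y)  # the penalty is 0 when theta is all zeros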
Regularized gradient

$$\frac{\partial J(\theta)}{\partial\theta_j}=\left(\frac{1}{m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)})-y^{(i)}\right)x_j^{(i)}\right)+\frac{\lambda}{m}\theta_j \quad \text{for } j\ge 1$$
def regularized_gradient(theta, X, y, l=1):
    # your code here (appro ~ 3 lines)
    m = len(y)
    regularized_term = l / m * theta
    regularized_term[0] = 0  # theta_0 is not regularized
    return gradient(theta, X, y) + regularized_term
regularized_gradient(theta, X, y)
array([8.47457627e-03, 1.87880932e-02, 7.77711864e-05, 5.03446395e-02,
1.15013308e-02, 3.76648474e-02, 1.83559872e-02, 7.32393391e-03,
8.19244468e-03, 2.34764889e-02, 3.93486234e-02, 2.23923907e-03,
1.28600503e-02, 3.09593720e-03, 3.93028171e-02, 1.99707467e-02,
4.32983232e-03, 3.38643902e-03, 5.83822078e-03, 4.47629067e-03,
3.10079849e-02, 3.10312442e-02, 1.09740238e-03, 6.31570797e-03,
4.08503006e-04, 7.26504316e-03, 1.37646175e-03, 3.87936363e-02])
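Likewise, at $\theta = 0$ the penalty term of the gradient vanishes, so the regularized and plain gradients should agree:

print(np.allclose(regularized_gradient(theta, X, y), gradient(theta, X, y)))  # True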
Fit the parameters
import scipy.optimize as opt
print('init cost = {}'.format(regularized_cost(theta, X, y)))
res = opt.minimize(fun=regularized_cost, x0=theta, args=(X, y), method='Newton-CG', jac=regularized_gradient)
res
init cost = 0.6931471805599454
fun: 0.5351602503809838
jac: array([-4.40953967e-08, 4.80421622e-09, -2.93966980e-09, -8.45363795e-09,
1.70329722e-08, 3.38119611e-08, -2.76831472e-09, -8.60782540e-09,
1.46305718e-09, 7.94770647e-09, -1.06087740e-08, 7.86887349e-10,
-6.42298185e-09, 7.96794764e-09, 2.02837825e-08, -5.31013168e-10,
-6.81811705e-09, 1.97820237e-09, -5.09429939e-09, -3.89325855e-11,
6.80254916e-09, -6.95320736e-09, -7.24708972e-10, -1.04093445e-09,
1.35729125e-09, -2.03967601e-09, 1.65029609e-09, 9.68433032e-09])
message: 'Optimization terminated successfully.'
nfev: 7
nhev: 0
nit: 6
njev: 53
status: 0
success: True
x: array([ 1.14213418, 0.60132066, 1.16718272, -1.87174547, -0.91573325,
-1.26952692, 0.12668314, -0.36873143, -0.34518464, -0.17377595,
-1.42386031, -0.04855655, -0.6064203 , -0.26931816, -1.16315916,
-0.24310139, -0.20707435, -0.04318408, -0.28028167, -0.28695652,
-0.46910528, -1.03618073, 0.02923477, -0.29262232, 0.0173672 ,
-0.32897267, -0.13795895, -0.93199012])
Prediction
final_theta = res.x
y_pred = predict(X, final_theta)
print(classification_report(y, y_pred))
precision recall f1-score support
0 0.88 0.75 0.81 60
1 0.78 0.90 0.83 58
accuracy 0.82 118
macro avg 0.83 0.82 0.82 118
weighted avg 0.83 0.82 0.82 118
Using different values of $\lambda$ (a constant)
Draw the decision boundary
- we find all x satisfying $X\theta = 0$
- instead of solving the polynomial equation, just create a coordinate grid of (x, y) that is dense enough, find all points where $X\theta$ is close enough to 0, and plot them
def draw_boundary(power, l):
    """
    power: polynomial power for mapped features
    l: lambda constant
    """
    density = 1000
    threshold = 2 * 10**-3

    final_theta = feature_mapped_logistic_regression(power, l)
    x, y = find_decision_boundary(density, power, final_theta, threshold)

    df = pd.read_csv('ex2data2.txt', names=['test1', 'test2', 'accepted'])
    sns.lmplot(x='test1', y='test2', hue='accepted', data=df, height=6, fit_reg=False, scatter_kws={"s": 100})

    plt.scatter(x, y, c='r', s=10)
    plt.title('Decision boundary')
    plt.show()
def feature_mapped_logistic_regression(power, l):
    """for drawing purposes only; not a well-generalized logistic regression
    power: int
        raise x1, x2 to this polynomial power
    l: int
        lambda constant for the regularization term
    """
    df = pd.read_csv('ex2data2.txt', names=['test1', 'test2', 'accepted'])
    x1 = np.array(df.test1)
    x2 = np.array(df.test2)
    y = get_y(df)

    X = feature_mapping(x1, x2, power, as_ndarray=True)
    theta = np.zeros(X.shape[1])

    res = opt.minimize(fun=regularized_cost,
                       x0=theta,
                       args=(X, y, l),
                       method='TNC',
                       jac=regularized_gradient)
    final_theta = res.x

    return final_theta
def find_decision_boundary(density, power, theta, threshold):
    """locate points on the decision boundary by dense grid search"""
    t1 = np.linspace(-1, 1.5, density)
    t2 = np.linspace(-1, 1.5, density)

    coordinates = [(x, y) for x in t1 for y in t2]
    x_cord, y_cord = zip(*coordinates)
    mapped_cord = feature_mapping(x_cord, y_cord, power)  # this is a dataframe

    inner_product = mapped_cord.values @ theta
    decision = mapped_cord[np.abs(inner_product) < threshold]

    return decision.f10, decision.f01
Change the value of $\lambda$ and observe the effect (optional)
draw_boundary(power=6, l=1)     # lambda = 1
draw_boundary(power=6, l=0.01)  # lambda = 0.01: little regularization, the boundary overfits
draw_boundary(power=6, l=100)   # lambda = 100: heavy regularization, the boundary underfits