import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
1.1 Data Visualization
df = pd.read_csv('ex2data1.txt', header=None,
                 names=['score 1', 'score 2', 'admission'])
df.head()
|   | score 1   | score 2   | admission |
|---|-----------|-----------|-----------|
| 0 | 34.623660 | 78.024693 | 0 |
| 1 | 30.286711 | 43.894998 | 0 |
| 2 | 35.847409 | 72.902198 | 0 |
| 3 | 60.182599 | 86.308552 | 1 |
| 4 | 79.032736 | 75.344376 | 1 |
X1 = df.iloc[:, 0].values
X2 = df.iloc[:, 1].values
y = df.iloc[:, 2].values
plt.scatter(X1[y == 0], X2[y == 0], c='y', marker='o', label='not admitted')
plt.scatter(X1[y == 1], X2[y == 1], c='k', marker='^', label='admitted')
plt.xlabel('score 1')
plt.ylabel('score 2')
plt.legend()
plt.show()
X=df.iloc[:,0:2].values
X=np.insert(X,0,1,axis=1)
X.shape
(100, 3)
y=df.iloc[:,-1].values
y.shape
(100,)
y=y.reshape(100,1)
y.shape
(100, 1)
1.2.1 Sigmoid Implementation
def sigmoid(z):
    h = 1 / (1 + np.exp(-z))
    return h
# sanity check: sigmoid(0) should be exactly 0.5
h_0 = sigmoid(0)
h_0
0.5
x=np.arange(-10,10,1)
h=sigmoid(x)
plt.plot(x,h,'r')
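A side note, not part of the original exercise: for large negative z, np.exp(-z) overflows float64 and NumPy emits a RuntimeWarning (the returned value is still correct, since 1/(1+inf) is 0). If that matters, scipy.special.expit is a numerically stable drop-in replacement:

from scipy.special import expit

z = np.array([-1000.0, 0.0, 1000.0])
expit(z)   # array([0. , 0.5, 1. ]) with no warning; sigmoid(z) above would warn on the first entry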
1.2.2 Cost Function and Gradient Descent
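For reference, with m training examples and h = sigmoid(Xθ), the cross-entropy cost and its gradient are

$$J(\theta) = -\frac{1}{m}\sum_{i=1}^{m}\left[y^{(i)}\log h^{(i)} + (1 - y^{(i)})\log(1 - h^{(i)})\right], \qquad \frac{\partial J}{\partial \theta_j} = \frac{1}{m}\sum_{i=1}^{m}\left(h^{(i)} - y^{(i)}\right)x_j^{(i)}$$

which is exactly what the two functions below compute.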
def cost(theta, X, y):
    X = np.matrix(X)
    y = np.matrix(y)
    theta = np.matrix(theta)
    h = sigmoid(X * theta.reshape(3, 1))   # (100, 1) vector of predictions
    # cross-entropy: -y*log(h) - (1-y)*log(1-h), averaged over all examples
    inner = -np.multiply(y, np.log(h)) - np.multiply((1 - y), np.log(1 - h))
    return np.sum(inner) / len(y)
theta = np.zeros(3)
cost_0 = cost(theta=theta, X=X, y=y)
cost_0   # with theta = 0, h = 0.5 everywhere, so this is ln 2
0.6931471805599453
# gradient of the cost function
def Gradient(theta, X, y):
    X = np.matrix(X)
    y = np.matrix(y)
    theta = np.matrix(theta)
    grad = np.zeros(3)
    h = sigmoid(X * theta.reshape(3, 1))   # hoisted out of the loop: h does not depend on j
    for j in range(3):
        term = np.multiply((h - y), X[:, j])
        grad[j] = np.sum(term) / len(y)
    return grad
theta=np.zeros(3)
grad_0=Gradient(theta=theta,X=X,y=y)
grad_0
array([ -0.1 , -12.00921659, -11.26284221])
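The loop over j is not actually needed: the whole gradient falls out of one matrix product. A sketch using plain ndarrays (np.matrix is discouraged in current NumPy):

def gradient_vec(theta, X, y):
    # (1/m) * X^T (h - y): the same three numbers as the loop above
    h = sigmoid(X @ theta.reshape(-1, 1))   # (m, 1) predictions
    return (X.T @ (h - y)).ravel() / len(y)

gradient_vec(np.zeros(3), X, y)   # array([ -0.1, -12.00921659, -11.26284221])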
theta=np.zeros(3)
1.2.3 Optimizing the Parameters
scipy.optimize.minimize(fun, x0, args=(), method=None, jac=None, hess=None, hessp=None, bounds=None, constraints=(), tol=None, callback=None, options=None)

- fun: the objective function to minimize
- x0: the initial guess for the parameters
- args: extra arguments passed to the objective function and its derivatives
- jac: method for computing the gradient vector

Returns:

- res: the optimization result, an OptimizeResult object
import scipy.optimize as opt
res = opt.minimize(fun=cost, x0=theta, args=(X, y),
method='Newton-CG', jac=Gradient)
res
fun: 0.2034977018633035
jac: array([-2.12380382e-05, -1.40885753e-03, -1.27811598e-03])
message: 'Optimization terminated successfully.'
nfev: 72
nhev: 0
nit: 28
njev: 186
status: 0
success: True
x: array([-25.16007951, 0.20622062, 0.20146256])
final_theta=res.x
Note: I originally had a typo in the cost function that kept the optimizer from converging. Always double-check the code carefully.
1.2.4 Training-Set Prediction and Boundary Visualization
def predict(X, theta):
    X = np.matrix(X)
    theta = np.matrix(theta)
    pred = sigmoid(X * theta.T)        # predicted probabilities
    return (pred >= 0.5).astype(int)   # threshold at 0.5
y_pred=predict(X=X,theta=final_theta)
from sklearn.metrics import classification_report
print(classification_report(y,y_pred))
precision recall f1-score support
0 0.87 0.85 0.86 40
1 0.90 0.92 0.91 60
accuracy 0.89 100
macro avg 0.89 0.88 0.88 100
weighted avg 0.89 0.89 0.89 100
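The fitted model can also score an individual applicant. As a spot check (this query is my addition, not in the original notebook), an applicant with exam scores of 45 and 85 should get an admission probability of about 0.776:

# P(admission) for exam scores (45, 85)
sigmoid(np.array([1, 45, 85]) @ final_theta)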
X1 = df.iloc[:, 0].values
X2 = df.iloc[:, 1].values
y = df.iloc[:, 2].values
plt.scatter(X1[y == 0], X2[y == 0], c='y', marker='o')
plt.scatter(X1[y == 1], X2[y == 1], c='k', marker='^')
# decision boundary: theta0 + theta1*x1 + theta2*x2 = 0, solved for x2
x1 = np.arange(30, 100, 1)
x2 = -final_theta[0] / final_theta[2] + (-final_theta[1] / final_theta[2]) * x1
plt.plot(x1, x2)
plt.xlabel('score 1')
plt.ylabel('score 2')
plt.show()
2 Regularization
df1 = pd.read_csv('ex2data2.txt', header=None,
                  names=['test1', 'test2', 'accepted'])
df1.head()
|   | test1     | test2   | accepted |
|---|-----------|---------|----------|
| 0 | 0.051267  | 0.69956 | 1 |
| 1 | -0.092742 | 0.68494 | 1 |
| 2 | -0.213710 | 0.69225 | 1 |
| 3 | -0.375000 | 0.50219 | 1 |
| 4 | -0.513250 | 0.46564 | 1 |
2.1 Visualization
import seaborn as sns
sns.lmplot(x='test1', y='test2', hue='accepted', data=df1, fit_reg=False)
X=df1.iloc[:,0:2].values
X=np.insert(X,0,1,axis=1)
X.shape
(118, 3)
2.2 Feature Mapping
x1 = df1['test1']
x2 = df1['test2']
df2 = pd.DataFrame(df1.iloc[:, 2].copy())   # start from the label column; mapped features are appended below
df2.head()
|   | accepted |
|---|----------|
| 0 | 1 |
| 1 | 1 |
| 2 | 1 |
| 3 | 1 |
| 4 | 1 |
# build polynomial features F_ij = x1^i * x2^j for 0 <= j <= i <= 5 (21 columns; F00 is the bias)
for i in range(6):
    for j in range(i + 1):
        df2['F' + str(i) + str(j)] = np.power(x1, i) * np.power(x2, j)
df2.head()
|   | accepted | F00 | F10 | F11 | F20 | F21 | F22 | F30 | F31 | F32 | ... | F41 | F42 | F43 | F44 | F50 | F51 | F52 | F53 | F54 | F55 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1.0 | 0.051267 | 0.035864 | 0.002628 | 0.001839 | 0.001286 | 0.000135 | 0.000094 | 0.000066 | ... | 0.000005 | 0.000003 | 0.000002 | 0.000002 | 3.541519e-07 | 2.477505e-07 | 1.733163e-07 | 1.212452e-07 | 8.481827e-08 | 5.933547e-08 |
| 1 | 1 | 1.0 | -0.092742 | -0.063523 | 0.008601 | 0.005891 | 0.004035 | -0.000798 | -0.000546 | -0.000374 | ... | 0.000051 | 0.000035 | 0.000024 | 0.000016 | -6.860919e-06 | -4.699318e-06 | -3.218751e-06 | -2.204651e-06 | -1.510054e-06 | -1.034296e-06 |
| 2 | 1 | 1.0 | -0.213710 | -0.147941 | 0.045672 | 0.031616 | 0.021886 | -0.009761 | -0.006757 | -0.004677 | ... | 0.001444 | 0.001000 | 0.000692 | 0.000479 | -4.457837e-04 | -3.085938e-04 | -2.136241e-04 | -1.478813e-04 | -1.023708e-04 | -7.086618e-05 |
| 3 | 1 | 1.0 | -0.375000 | -0.188321 | 0.140625 | 0.070620 | 0.035465 | -0.052734 | -0.026483 | -0.013299 | ... | 0.009931 | 0.004987 | 0.002505 | 0.001258 | -7.415771e-03 | -3.724126e-03 | -1.870219e-03 | -9.392053e-04 | -4.716595e-04 | -2.368627e-04 |
| 4 | 1 | 1.0 | -0.513250 | -0.238990 | 0.263426 | 0.122661 | 0.057116 | -0.135203 | -0.062956 | -0.029315 | ... | 0.032312 | 0.015046 | 0.007006 | 0.003262 | -3.561597e-02 | -1.658422e-02 | -7.722277e-03 | -3.595801e-03 | -1.674349e-03 | -7.796437e-04 |

5 rows × 22 columns
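For reuse later (for example, when plotting the decision boundary in section 2.4), the same mapping can be wrapped in a small helper. This function is my addition, not part of the original notebook:

def map_feature(x1, x2, degree=5):
    # columns F_ij = x1^i * x2^j for 0 <= j <= i <= degree; F00 is the all-ones bias
    x1, x2 = np.asarray(x1), np.asarray(x2)
    cols = [np.power(x1, i) * np.power(x2, j)
            for i in range(degree + 1) for j in range(i + 1)]
    return np.stack(cols, axis=1)

map_feature(df1['test1'], df1['test2']).shape   # (118, 21), identical to df2.iloc[:, 1:]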
X=df2.iloc[:,1:].values
X.shape
(118, 21)
y=df2.iloc[:,0].values.reshape((118,1))
y.shape
(118, 1)
theta=np.zeros(21)
2.3 Cost Function and Gradient Function
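With regularization strength λ, a penalty is added to the cost (the bias term θ0 is deliberately left unpenalized):

$$J(\theta) = -\frac{1}{m}\sum_{i=1}^{m}\left[y^{(i)}\log h^{(i)} + (1 - y^{(i)})\log(1 - h^{(i)})\right] + \frac{\lambda}{2m}\sum_{j=1}^{n}\theta_j^2$$

and the gradient picks up an extra $\frac{\lambda}{m}\theta_j$ term for every $j \geq 1$.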
def RegCost(theta, X, y, lam=1):
    X = np.matrix(X)
    y = np.matrix(y)
    theta = np.matrix(theta)   # np.matrix is always 2-D, so indexing is 2-D too
    h = sigmoid(X * theta.T)
    inner = np.sum(-np.multiply(y, np.log(h)) - np.multiply((1 - y), np.log(1 - h)))
    reg = np.sum(np.power(theta[0, 1:], 2)) * (lam / (2 * y.size))   # skip theta_0
    return inner / y.size + reg
cost_0=RegCost(theta=np.zeros(21),X=X,y=y,lam=1)
cost_0
0.6931471805599454
def RegGradient(theta, X, y, lam=1):
    X = np.matrix(X)
    y = np.matrix(y)
    theta = np.matrix(theta)
    grad = np.zeros(21)
    h = sigmoid(X * theta.T)                            # hoisted: h does not depend on i
    grad[0] = np.mean(np.multiply((h - y), X[:, 0]))    # bias term is not regularized
    for i in np.arange(1, theta.size):
        grad[i] = np.mean(np.multiply((h - y), X[:, i])) + (lam / y.size) * theta[0, i]
    return grad
theta_0=RegGradient(theta=np.zeros(21),X=X,y=y,lam=1)
theta_0
array([0.00847458, 0.01878809, 0.01150133, 0.05034464, 0.00732393,
0.01286005, 0.01835599, 0.00223924, 0.00338644, 0.0004085 ,
0.03934862, 0.00432983, 0.00631571, 0.0022186 , 0.00273346,
0.01997075, 0.0010974 , 0.00232501, 0.00023643, 0.00082851,
0.00014858])
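A quick way to build confidence in RegGradient is to compare it against a finite-difference approximation of RegCost. This check is my addition:

def numeric_grad(f, theta, eps=1e-6, **kwargs):
    # central differences: (f(theta + eps*e_i) - f(theta - eps*e_i)) / (2*eps)
    g = np.zeros_like(theta)
    for i in range(theta.size):
        step = np.zeros_like(theta)
        step[i] = eps
        g[i] = (f(theta + step, **kwargs) - f(theta - step, **kwargs)) / (2 * eps)
    return g

ng = numeric_grad(RegCost, np.zeros(21), X=X, y=y, lam=1)
np.max(np.abs(ng - RegGradient(np.zeros(21), X=X, y=y, lam=1)))   # should be tiny, around 1e-9 or below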
Notes on len():

- len() returns the length of an object (note: the function is len(), not length())
- len([1, 2, 3]) returns 3
- len([[1, 2, 3], [3, 4, 5]]) returns 2 (only the outer list is counted)

np.size: the total number of elements in an array.
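On the arrays used here the two agree, but they measure different things; a quick illustration (my addition):

len(y)    # 118: length along the first axis of the (118, 1) array
y.size    # 118: total element count; a (118, 2) array would still give len 118 but size 236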
2.3.1 Optimal Parameters
import scipy.optimize as opt
res = opt.minimize(fun=RegCost, x0=theta, args=(X, y),
method='Newton-CG', jac=RegGradient)
res
fun: 0.6221421403170583
jac: array([-1.87734013e-07, -2.06260386e-08, 2.10400267e-07, 2.58115072e-07,
3.95327254e-08, 9.94764237e-08, -3.83473311e-08, 8.54353401e-08,
6.19594052e-10, 1.02611418e-08, 1.85739142e-07, 4.94244557e-08,
2.07551944e-08, 2.89422126e-08, 2.17103864e-08, 2.47760803e-08,
3.85853035e-08, 1.02316448e-09, 1.05648222e-08, 8.67732081e-10,
8.53476468e-10])
message: 'Optimization terminated successfully.'
nfev: 6
nhev: 0
nit: 5
njev: 41
status: 0
success: True
x: array([ 0.52581305, 0.25862837, -0.64020577, -1.57231338, -0.17089508,
-0.94052761, 0.09077677, 0.01838949, -0.11369789, 0.06170887,
-1.32259036, -0.09367774, -0.36814802, -0.1297648 , -0.20163368,
-0.27777399, 0.05673225, -0.08174776, 0.02535854, -0.03365299,
0.01192529])
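λ = 1 above is just the default; the strength of the penalty is worth tuning. A sketch of a quick sweep, reusing the functions defined above (my addition; λ = 0 may emit log-of-zero warnings if the training data become separable):

for lam in [0, 0.5, 1, 10, 100]:
    r = opt.minimize(fun=RegCost, x0=np.zeros(21), args=(X, y, lam),
                     method='Newton-CG', jac=RegGradient)
    acc = np.mean(predict(X, r.x) == y)
    print(f'lambda={lam:>5}: train accuracy {acc:.2f}')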
2.4 Plotting the Decision Boundary
final_theta_0=res.x
y_pred=predict(X,final_theta_0)
print(classification_report(y,y_pred))
precision recall f1-score support
0 0.71 0.58 0.64 60
1 0.64 0.76 0.69 58
accuracy 0.67 118
macro avg 0.68 0.67 0.67 118
weighted avg 0.68 0.67 0.67 118
The results here are not great; the training accuracy is only 0.67.
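Finally, the boundary this section's heading promises. One way to draw it (a sketch, my addition): evaluate the raw score theta^T f(x1, x2) on a grid with the map_feature helper from section 2.2, then trace its zero level set with plt.contour:

u = np.linspace(-1, 1.2, 200)
v = np.linspace(-1, 1.2, 200)
U, V = np.meshgrid(u, v)
Z = (map_feature(U.ravel(), V.ravel()) @ final_theta_0).reshape(U.shape)

t1 = df1['test1'].values
t2 = df1['test2'].values
acc = df1['accepted'].values
plt.scatter(t1[acc == 0], t2[acc == 0], c='y', marker='o')
plt.scatter(t1[acc == 1], t2[acc == 1], c='k', marker='^')
plt.contour(U, V, Z, levels=[0], colors='b')   # decision boundary: theta^T f(x) = 0
plt.xlabel('test1')
plt.ylabel('test2')
plt.show()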