1.1 Visualizing the data
数据文件中有三列,前两列为两门考试的成绩,第三列为是否被录取,1表示被录取,0表示没被录取。
第一个题目是将数据可视化。最大的困难是按第三列(是否被录取)把数据分成两类:我想过逐行删除不属于该类的数据,但删除后行数会变化,循环次数不好确定;也想过用 for 循环逐行向矩阵追加数据,但查阅资料后发现 NumPy 每次追加都会重新分配内存并复制整个数组,效率很低。最终的做法是先按两类的样本数各创建一个零矩阵,再用 for 循环把每一行数据填入对应的矩阵。
import numpy as np
import matplotlib.pyplot as plt
def readdata(datapath, num_feature):
    """Load a comma-separated data file into feature and label arrays.

    Every non-blank line must hold at least ``num_feature + 1`` numeric
    fields: the first ``num_feature`` fields are the features and the field
    right after them is the label.

    Args:
        datapath: Path of the text file to read.
        num_feature: Number of leading columns to treat as features.

    Returns:
        A tuple ``(feature, result, lens)``: ``feature`` has shape
        ``(lens, num_feature)``, ``result`` has shape ``(lens, 1)`` and
        ``lens`` is the number of data rows that were read.

    Raises:
        ValueError: If a field is not numeric or a row has too few features.
        IndexError: If a row has no label field after the features.
    """
    # The context manager closes the file even when parsing fails.
    with open(datapath, "r") as handle:
        # Blank lines (e.g. a trailing newline) carry no data, so skip them.
        rows = [line.strip().split(",") for line in handle if line.strip()]

    lens = len(rows)
    feature = np.zeros((lens, num_feature))
    result = np.zeros((lens, 1))
    for row_index, fields in enumerate(rows):
        feature[row_index, :] = [float(v) for v in fields[:num_feature]]
        result[row_index] = float(fields[num_feature])
    return feature, result, lens
def visualize(feature, result, lens):
    """Scatter-plot the first two feature columns, coloured by label.

    Rows whose label equals 1 are drawn with colour ``'#FF69B4'``; every
    other row is drawn with colour ``'000000'``. The figure is shown before
    the function returns.

    Args:
        feature: 2-D array; column 0 is plotted on the x axis and column 1
            on the y axis.
        result: Labels, one per row of ``feature``.
        lens: Number of leading rows to plot.
    """
    labels = np.asarray(result).reshape(-1)[:lens]
    points = np.asarray(feature)[:lens]
    # One mask decides both groups, so the two groups can never disagree
    # about where a row with an unexpected label value belongs.
    is_positive = labels == 1
    pos = points[is_positive]
    neg = points[~is_positive]
    plt.scatter(pos[:, 0], pos[:, 1], c='#FF69B4')
    # NOTE(review): '000000' lacks the leading '#' of a hex colour; confirm
    # that it renders as intended (presumably black).
    plt.scatter(neg[:, 0], neg[:, 1], c='000000')
    plt.xlabel("exam1")
    plt.ylabel("exam2")
    plt.show()
# Raw string: "\e", "\p" and "\w" are not valid escape sequences, so the
# plain literal only works by accident and triggers a warning on import.
datapath1 = r"E:\exer\python\wex2\ex2data1.txt"
feature1, result1, lens = readdata(datapath1, 2)
visualize(feature1, result1, lens)
1.2逻辑斯蒂回归(使用优化函数)
import numpy as np
import matplotlib.pyplot as plt
import scipy.optimize as opt
# Read the data
def readdata(datapath, num_feature):
    """Load a comma-separated data file into feature and label arrays.

    Every non-blank line must hold at least ``num_feature + 1`` numeric
    fields: the first ``num_feature`` fields are the features and the field
    right after them is the label.

    Args:
        datapath: Path of the text file to read.
        num_feature: Number of leading columns to treat as features.

    Returns:
        A tuple ``(feature, result, lens)``: ``feature`` has shape
        ``(lens, num_feature)``, ``result`` has shape ``(lens, 1)`` and
        ``lens`` is the number of data rows that were read.

    Raises:
        ValueError: If a field is not numeric or a row has too few features.
        IndexError: If a row has no label field after the features.
    """
    # The context manager closes the file even when parsing fails.
    with open(datapath, "r") as handle:
        # Blank lines (e.g. a trailing newline) carry no data, so skip them.
        rows = [line.strip().split(",") for line in handle if line.strip()]

    lens = len(rows)
    feature = np.zeros((lens, num_feature))
    result = np.zeros((lens, 1))
    for row_index, fields in enumerate(rows):
        feature[row_index, :] = [float(v) for v in fields[:num_feature]]
        result[row_index] = float(fields[num_feature])
    return feature, result, lens
# Visualisation
def visualize(feature, result, lens):
    """Add a label-coloured scatter of the first two feature columns.

    Rows whose label equals 1 are drawn with colour ``'#FF69B4'``; every
    other row is drawn with colour ``'000000'``. The function only adds to
    the current figure; it does not call ``plt.show()``.

    Args:
        feature: 2-D array; column 0 is plotted on the x axis and column 1
            on the y axis.
        result: Labels, one per row of ``feature``.
        lens: Number of leading rows to plot.
    """
    labels = np.asarray(result).reshape(-1)[:lens]
    points = np.asarray(feature)[:lens]
    # One mask decides both groups, so the two groups can never disagree
    # about where a row with an unexpected label value belongs.
    is_positive = labels == 1
    pos = points[is_positive]
    neg = points[~is_positive]
    plt.scatter(pos[:, 0], pos[:, 1], c='#FF69B4')
    # NOTE(review): '000000' lacks the leading '#' of a hex colour; confirm
    # that it renders as intended (presumably black).
    plt.scatter(neg[:, 0], neg[:, 1], c='000000')
    plt.xlabel("exam1")
    plt.ylabel("exam2")
def costfunction(w, feature, result):
    """Return the mean logistic (cross-entropy) loss for weights ``w``.

    Args:
        w: Weight vector of shape ``(n,)`` or ``(n, 1)``.
        feature: Array of shape ``(n, m)``, one column per sample.
        result: 0/1 labels of shape ``(1, m)``.

    Returns:
        The loss summed over all samples and divided by ``len(result.T)``.
    """
    z = np.dot(w.T, feature)
    # -log(sigmoid(z)) == logaddexp(0, -z) and
    # -log(1 - sigmoid(z)) == logaddexp(0, z). Using these forms avoids
    # log(0) = -inf (and 0 * -inf = nan) once the sigmoid saturates.
    per_sample = (result * np.logaddexp(0, -z)
                  + (1 - result) * np.logaddexp(0, z))
    return np.sum(per_sample) / len(result.T)
def gradient(w, feature, result):
    """Return the gradient of the mean logistic loss with respect to ``w``.

    Args:
        w: Weight vector of shape ``(n,)`` or ``(n, 1)``.
        feature: Array of shape ``(n, m)``, one column per sample.
        result: 0/1 labels of shape ``(1, m)``.

    Returns:
        Array of shape ``(n, 1)``: ``feature`` times the transposed
        prediction error, divided by ``len(result.T)``.
    """
    sample_count = len(result.T)
    logits = np.dot(w.T, feature)
    # Sigmoid of the logits gives the predicted probabilities.
    activation = 1 / (1 + np.exp(-logits))
    residual = activation - result
    return np.dot(feature, residual.T) / sample_count
# main
# Raw string: "\e", "\p" and "\w" are not valid escape sequences, so the
# plain literal only works by accident and triggers a warning on import.
datapath1 = r"E:\exer\python\wex2\ex2data1.txt"
feature1, result1, lens = readdata(datapath1, 2)
visualize(feature1, result1, lens)

w = np.zeros([3, 1])
# Prepend a column of ones for the bias term, then switch to the
# one-column-per-sample layout that costfunction/gradient expect.
B = np.ones((lens, 1))
feature1 = np.append(B, feature1, axis=1)
feature1 = feature1.T
result1 = result1.T

# Evaluate both callbacks once at the initial weights before optimising.
gradient(w, feature1, result1)
J0 = costfunction(w, feature1, result1)

result = opt.fmin_tnc(
    func=costfunction, x0=w, fprime=gradient, args=(feature1, result1)
)
print(result[0])
w = result[0]

# Check the decision boundary: the line w0 + w1*x1 + w2*x2 = 0, drawn on
# top of the scatter plot produced by visualize().
x1 = np.arange(20, 100, 1)
x2 = (-w[0] - w[1] * x1) / w[2]
plt.plot(x1, x2)
plt.show()

# Check the answer: predicted probability for exam scores 45 and 85
# (the leading 1 is the bias input).
x = [1, 45, 85]
z = np.dot(w.T, x)
A = 1 / (1 + np.exp(-z))
print(A)
2.正则化逻辑回归
当 w 中系数全为 0 时,每个样本的预测概率都是 0.5,正则项为 0,因此损失函数结果为 $J(w) = -\ln 0.5 = \ln 2 \approx 0.693$。
以下为代码:
其中最困难的部分是调用优化函数(fmin_tnc)进行拟合时程序无法运行。反复尝试后发现,优化函数传给损失函数和梯度函数的 w 是一维数组,必须先在函数内部把 w reshape 成列向量才可以。
边界函数还没有画出来,以后会进行思考,暂时使用准确率来衡量模型,因为本次实验耗时太久了。
import numpy as np
import matplotlib.pyplot as plt
import scipy.optimize as opt
from mpl_toolkits.mplot3d import Axes3D
# Read the data
def readdata(datapath, num_feature):
    """Load a comma-separated data file into feature and label arrays.

    Every non-blank line must hold at least ``num_feature + 1`` numeric
    fields: the first ``num_feature`` fields are the features and the field
    right after them is the label.

    Args:
        datapath: Path of the text file to read.
        num_feature: Number of leading columns to treat as features.

    Returns:
        A tuple ``(feature, result, lens)``: ``feature`` has shape
        ``(lens, num_feature)``, ``result`` has shape ``(lens, 1)`` and
        ``lens`` is the number of data rows that were read.

    Raises:
        ValueError: If a field is not numeric or a row has too few features.
        IndexError: If a row has no label field after the features.
    """
    # The context manager closes the file even when parsing fails.
    with open(datapath, "r") as handle:
        # Blank lines (e.g. a trailing newline) carry no data, so skip them.
        rows = [line.strip().split(",") for line in handle if line.strip()]

    lens = len(rows)
    feature = np.zeros((lens, num_feature))
    result = np.zeros((lens, 1))
    for row_index, fields in enumerate(rows):
        feature[row_index, :] = [float(v) for v in fields[:num_feature]]
        result[row_index] = float(fields[num_feature])
    return feature, result, lens
# Visualisation
def visualize(feature, result, lens):
    """Add a label-coloured scatter of the first two feature columns.

    Rows whose label equals 1 are drawn with colour ``'#FF69B4'``; every
    other row is drawn with colour ``'000000'``. The function only adds to
    the current figure; it does not call ``plt.show()``.

    Args:
        feature: 2-D array; column 0 is plotted on the x axis and column 1
            on the y axis.
        result: Labels, one per row of ``feature``.
        lens: Number of leading rows to plot.
    """
    labels = np.asarray(result).reshape(-1)[:lens]
    points = np.asarray(feature)[:lens]
    # One mask decides both groups, so the two groups can never disagree
    # about where a row with an unexpected label value belongs.
    is_positive = labels == 1
    pos = points[is_positive]
    neg = points[~is_positive]
    plt.scatter(pos[:, 0], pos[:, 1], c='#FF69B4')
    # NOTE(review): '000000' lacks the leading '#' of a hex colour; confirm
    # that it renders as intended (presumably black).
    plt.scatter(neg[:, 0], neg[:, 1], c='000000')
    plt.xlabel("exam1")
    plt.ylabel("exam2")
def map_feature6(feature, lens, degree=6):
    """Expand two features into all polynomial terms up to ``degree``.

    The rows of the result are, in order: a row of ones, then for each total
    degree ``i = 1..degree`` and each ``j = 0..i`` the term
    ``x1**j * x2**(i - j)``, where ``x1`` and ``x2`` are the first two
    columns of ``feature``.

    Args:
        feature: 2-D array with ``lens`` rows and at least two columns.
        lens: Number of samples (rows of ``feature``).
        degree: Highest total degree to generate; the default 6 yields
            28 rows.

    Returns:
        Array of shape ``(number_of_terms, lens)``, one column per sample.
    """
    x1 = feature[:, 0]
    x2 = feature[:, 1]
    # Collect the rows first and stack once; stacking inside the loop would
    # copy the whole growing array for every new term.
    rows = [np.ones(lens)]
    for total in range(1, degree + 1):
        for power in range(total + 1):
            rows.append((x1 ** power) * (x2 ** (total - power)))
    return np.vstack(rows)
def costfunction(w, feature, result, k=1):
    """Return the L2-regularised mean logistic loss for weights ``w``.

    Args:
        w: Weights with ``feature.shape[0]`` elements; reshaped internally
            to a column vector, so a flat array is accepted.
        feature: Array of shape ``(n, m)``, one column per sample.
        result: 0/1 labels of shape ``(1, m)``.
        k: Regularisation strength. The first weight (``w[0]``) is excluded
            from the penalty.

    Returns:
        A 1x1 array holding the data loss plus ``k * sum(w[1:]**2) / (2*m)``,
        where ``m`` is ``len(result.T)``.
    """
    w = np.reshape(w, (feature.shape[0], 1))
    m = len(result.T)
    z = np.dot(w.T, feature)
    # -log(sigmoid(z)) == logaddexp(0, -z) and
    # -log(1 - sigmoid(z)) == logaddexp(0, z). Using these forms avoids
    # log(0) = -inf (and 0 * -inf = nan) once the sigmoid saturates.
    per_sample = (result * np.logaddexp(0, -z)
                  + (1 - result) * np.logaddexp(0, z))
    data_loss = np.sum(per_sample) / m
    # w.T @ w minus w[0]**2 is the squared norm of every weight but the first.
    penalty = k * (np.dot(w.T, w) - w[0] ** 2) / (2 * m)
    return data_loss + penalty
def gradient(w, feature, result, k=1):
    """Return the gradient of the L2-regularised mean logistic loss.

    Args:
        w: Weights with ``feature.shape[0]`` elements; reshaped internally
            to a column vector, so a flat array is accepted. The caller's
            array is left unmodified.
        feature: Array of shape ``(n, m)``, one column per sample.
        result: 0/1 labels of shape ``(1, m)``.
        k: Regularisation strength. The first weight (``w[0]``) is excluded
            from the penalty.

    Returns:
        Array of shape ``(n, 1)``.
    """
    w = np.reshape(w, (feature.shape[0], 1))
    m = len(result.T)
    z = np.dot(w.T, feature)
    A = 1 / (1 + np.exp(-z))
    dz = A - result
    # np.reshape can return a view of the caller's array, so zeroing the
    # first weight must happen on a copy; otherwise the caller's w[0] is
    # silently overwritten.
    penalised = w.copy()
    penalised[0] = 0
    return np.dot(feature, dz.T) / m + k * penalised / m
# main
# Raw string: "\e", "\p" and "\w" are not valid escape sequences, so the
# plain literal only works by accident and triggers a warning on import.
datapath1 = r"E:\exer\python\wex2\ex2data2.txt"
feature1, result1, lens = readdata(datapath1, 2)
# NOTE: this script never calls plt.show(), so the scatter is not displayed.
visualize(feature1, result1, lens)

w = np.zeros([28, 1])
map_feature = map_feature6(feature1, lens)
result1 = result1.T

# Evaluate both callbacks once at the initial weights before optimising.
J0 = costfunction(w, map_feature, result1)
gradient(w, map_feature, result1)

# Alternative optimiser that was tried:
# opt.minimize(fun=costfunction, x0=w, args=(map_feature, result1),
#              method='Newton-CG', jac=gradient)
result = opt.fmin_tnc(
    func=costfunction, x0=w, fprime=gradient, args=(map_feature, result1)
)
w = result[0]
# Plain column vector; np.mat is no longer available in NumPy 2.0.
w = np.reshape(w, (-1, 1))

# compute accuracy
z = np.dot(w.T, map_feature)
A = 1 / (1 + np.exp(-z))
A = A.T
labels = result1.T
# A sample counts as correct when the probability is strictly on the right
# side of 0.5; a probability of exactly 0.5 counts as wrong.
correct = ((A > 0.5) & (labels == 1)) | ((A < 0.5) & (labels == 0))
num = int(np.count_nonzero(correct))
accuracy = num / lens
print('Train Accuracy:', accuracy)
我参考了很多博主画边界线的方法,但都没有成功,请各位指教!!!!!!!