【机器学习】python实现逻辑回归模型(不直接调库)
在学习过程中练手,对模型的构建、收敛、调参印象更加深刻
import numpy as np
import matplotlib.pyplot as plt
import h5py
import scipy
import pandas as pd
# Load the data and split it into a training set and a test set
def load_dataset(path=r'C:\Users\YU\Desktop\cancer.data', train_size=500):
    """Read the cancer data file and split it into training and test sets.

    The file is parsed as header-less, comma-separated text.  Rows that
    contain the ``'?'`` missing-value marker in any column are discarded,
    the first column is dropped, and the last column is turned into a
    binary label: a raw value of 2 becomes 0, anything else becomes 1.
    Every column in between is used as a feature.

    Args:
        path: Location of the data file.  Defaults to the location the
            script was originally written for.
        train_size: Number of leading rows used for training; all of the
            remaining rows form the test set.

    Returns:
        ``(train_x, train_y, test_x, test_y)`` as float arrays.  The ``x``
        arrays hold one sample per row and one feature per column; the
        ``y`` arrays have shape ``(n_samples, 1)``.
    """
    df = pd.read_csv(path, header=None, sep=',')
    # Object dtype keeps the '?' comparison element-wise even when pandas
    # parsed every column as numeric.
    raw = np.array(df, dtype=object)

    # Filter all incomplete rows in one pass.  Deleting rows while scanning
    # by index skips the row that slides into the freed position, so two
    # consecutive incomplete rows would leave a '?' behind and make the
    # float conversion below fail.
    complete = ~(raw == '?').any(axis=1)

    # Drop column 0 (presumably a sample id -- confirm against the data file).
    data = raw[complete][:, 1:].astype(float)

    # Slice ends are exclusive: take every column before the label as a
    # feature and every row after the training part as test data, so neither
    # the last feature column nor the last sample is silently lost.
    features = data[:, :-1]
    labels = np.where(data[:, -1] == 2, 0.0, 1.0).reshape(-1, 1)

    train_x, test_x = features[:train_size], features[train_size:]
    train_y, test_y = labels[:train_size], labels[train_size:]
    return train_x, train_y, test_x, test_y
# Logistic (sigmoid) activation function
def sigmoid(z):
    """Return the element-wise logistic function ``1 / (1 + exp(-z))``.

    Args:
        z: A scalar or array-like of real numbers.

    Returns:
        Values in ``[0, 1]`` with the same shape as ``z`` (a NumPy scalar
        when ``z`` is a scalar).
    """
    z = np.asarray(z, dtype=float)
    # exp(-|z|) lies in [0, 1], so neither branch can overflow the way
    # exp(-z) does for large negative z.
    decay = np.exp(-np.abs(z))
    s = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # real arrays unchanged.
    return s[()]
# Initialise the weights and the bias to zero
def initialize_with_zeros(dim):
    """Create zero-valued starting parameters for ``dim`` features.

    Args:
        dim: Number of features, i.e. the number of rows of ``w``.

    Returns:
        ``(w, b)`` where ``w`` is a ``(dim, 1)`` array of zeros and ``b``
        is the integer 0.
    """
    w = np.zeros((dim, 1))
    b = 0
    return w, b
# Objective function: compute the cost and its gradients
def propagate(w, b, X, Y):
    """Compute the logistic-regression cost and its gradients.

    Args:
        w: Weight array of shape ``(n_features, 1)``.
        b: Bias, a scalar.
        X: Samples, one per row, shape ``(m, n_features)``.
        Y: Labels of shape ``(m, 1)``.

    Returns:
        ``(grads, cost)``.  ``grads`` is a dict with ``"dw"`` (same shape
        as ``w``) and ``"db"`` (scalar); ``cost`` is the mean cross-entropy
        over the ``m`` samples as a scalar.
    """
    m = X.shape[0]
    A = sigmoid(np.dot(X, w) + b)  # predicted probabilities, shape (m, 1)

    # Y stays an (m, 1) column so it lines up with A element by element.
    # Squeezing it to shape (m,) would broadcast against A into an (m, m)
    # matrix and sum over every pair of samples instead of every sample.
    # The clip is applied only to the inputs of the logarithms so that a
    # saturated prediction yields a large finite cost rather than inf/nan.
    eps = np.finfo(float).eps
    A_safe = np.clip(A, eps, 1 - eps)
    cost = -np.sum(Y * np.log(A_safe) + (1 - Y) * np.log(1 - A_safe)) / m

    # Gradients of the cost with respect to the weights and the bias
    error = A - Y
    dw = np.dot(X.T, error) / m
    db = np.sum(error) / m

    assert dw.shape == w.shape
    grads = {
        "dw": dw,
        "db": db,
    }
    return grads, cost
# Gradient-descent optimiser
def optimize(w, b, X, Y, num_iterations, learning_rate, print_cost):
    """Fit ``w`` and ``b`` with batch gradient descent.

    Args:
        w: Initial weights.
        b: Initial bias.
        X: Training samples, passed through to ``propagate``.
        Y: Training labels, passed through to ``propagate``.
        num_iterations: Number of update steps to perform.
        learning_rate: Step size applied to each gradient.
        print_cost: If true, print the cost every 100 iterations.

    Returns:
        ``(params, grads, costs)``.  ``params`` holds the final ``'w'`` and
        ``'b'``; ``grads`` holds the ``'dw'`` and ``'db'`` used by the last
        update step; ``costs`` lists the cost recorded every 50 iterations.
    """
    costs = []
    dw = db = None
    for i in range(num_iterations):
        grads, cost = propagate(w, b, X, Y)
        dw = grads['dw']
        db = grads['db']
        w = w - learning_rate * dw
        b = b - learning_rate * db
        if i % 50 == 0:
            costs.append(cost)
        if print_cost and i % 100 == 0:
            print("cost after iteration %i:%f" % (i, cost))

    if dw is None:
        # No update step ran (num_iterations <= 0).  Report the gradients
        # at the unchanged starting point instead of failing on names that
        # were never assigned.
        grads, _ = propagate(w, b, X, Y)
        dw = grads['dw']
        db = grads['db']

    params = {
        'w': w,
        'b': b,
    }
    grads = {
        'dw': dw,
        'db': db,
    }
    return params, grads, costs
# Predict labels with the optimised parameters w and b
def predict(w, b, X):
    """Classify every row of ``X`` as 0 or 1.

    Args:
        w: Weights of shape ``(n_features, 1)``.
        b: Bias, a scalar.
        X: Samples, one per row.

    Returns:
        A float array of shape ``(m, 1)`` where ``m`` is the number of rows
        of ``X``.  An entry is 0 when the predicted probability is at most
        0.5 and 1 otherwise.
    """
    sample_count = X.shape[0]
    probabilities = sigmoid(np.dot(X, w) + b)
    # Only the first column is thresholded; "<= 0.5 -> 0, else 1" is kept
    # in exactly that form so every other value still maps to 1.
    Y_prediction = np.where(probabilities[:, :1] <= 0.5, 0.0, 1.0)
    assert Y_prediction.shape == (sample_count, 1)
    return Y_prediction
def model(X_train, Y_train, X_test, Y_test, num_iterations, learning_rate, print_cost):
    """Train a logistic-regression classifier and report its accuracy.

    Args:
        X_train: Training samples, one per row.
        Y_train: Training labels, compared element-wise with the
            training predictions.
        X_test: Test samples, one per row.
        Y_test: Test labels, compared element-wise with the test
            predictions.
        num_iterations: Number of gradient-descent steps.
        learning_rate: Gradient-descent step size.
        print_cost: Forwarded to ``optimize`` to enable cost printing.

    Returns:
        A dict with the recorded costs, the test and training predictions,
        the fitted ``w`` and ``b``, and the two hyper-parameters used.
    """
    # Start from all-zero parameters, one weight per feature column
    feature_count = X_train.shape[1]
    w_start, b_start = initialize_with_zeros(feature_count)

    # Fit the parameters by gradient descent
    fitted, _, costs = optimize(
        w_start, b_start, X_train, Y_train,
        num_iterations, learning_rate, print_cost,
    )
    w = fitted['w']
    b = fitted['b']

    # Predictions on both splits
    Y_prediction_test = predict(w, b, X_test)
    Y_prediction_train = predict(w, b, X_train)

    # Accuracy in percent: 100 minus the mean absolute error of the labels
    train_accuracy = 100 - np.mean(np.abs(Y_prediction_train - Y_train)) * 100
    print("train accuracy: {} %".format(train_accuracy))
    test_accuracy = 100 - np.mean(np.abs(Y_prediction_test - Y_test)) * 100
    print("test accuracy: {} %".format(test_accuracy))

    d = {
        "costs": costs,
        "Y_prediction_test": Y_prediction_test,
        "Y_prediction_train": Y_prediction_train,
        "w": w,
        "b": b,
        "learning_rate": learning_rate,
        "num_iterations": num_iterations,
    }
    return d
# Load the cancer data set, already split into training and test parts
train_set_x, train_set_y, test_set_x, test_set_y = load_dataset()

# Reshape each feature array to (number of samples, everything else)
train_set_x = train_set_x.reshape(len(train_set_x), -1)
test_set_x = test_set_x.reshape(len(test_set_x), -1)

# Train the model and print its training and test accuracy
paras = model(
    train_set_x,
    train_set_y,
    test_set_x,
    test_set_y,
    num_iterations=500,
    learning_rate=0.001,
    print_cost=False,
)