Logistic Regression (SGD): A Python Implementation of Logistic Regression with Stochastic Gradient Descent
This first practice post is dedicated to LR (logistic regression); comments and corrections from more experienced readers are very welcome.
Algorithm
descent (update) rule: $\theta_c \leftarrow \theta_c - \eta \, \nabla_{\theta_c} L(\theta)$
gradient (over a batch $B$ of size $m$): $\nabla_{\theta_c} L(\theta) = -\frac{1}{m} \sum_{i \in B} \big( \mathbb{1}[y_i = c] - P(y_i = c \mid x_i) \big)\, \bar{x}_i$, where $\bar{x}_i = [x_i; 1]$
probability function (class 0 as the reference class): $P(y = c \mid x) = \dfrac{\exp(\theta_c^{\top} \bar{x})}{1 + \sum_{j=1}^{k-1} \exp(\theta_j^{\top} \bar{x})}$ for $c \ge 1$, and $P(y = 0 \mid x) = \dfrac{1}{1 + \sum_{j=1}^{k-1} \exp(\theta_j^{\top} \bar{x})}$
parameters
- k : number of classes
- n : number of training data points
- d : dimension of the features
- m : batch size
- eta : learning rate
- epsilon : learning-rate reduction criterion
- theta : [W : b] (weights with the bias appended)
- linear part : y_i = W·x_i + b
- X : feature set
- y : label set
functions
- logreg_predict_prob() : computes the probability that X[i] belongs to class j
- loss() : the loss function to minimize (average negative log-likelihood)
- gradient() : computes the update step for one batch
- log() : derivative of the log-likelihood of one sample with respect to theta_c
- getBatches() : divides the n training data points into batches of size m
- logreg_fit() : parameter estimation; outputs W, b
- sigmoid() : the sigmoid function
- logreg_predict_class() : prediction function for test data
- load() : loads a CSV file and splits it into X_train, y_train (a homemade helper, limited in generality)
Let’s start training
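Judging from the code below and from main(), the snippets rely on the following imports (the aliases pp and ms match how they are used later):

    import numpy as np
    from random import shuffle
    from sklearn import preprocessing as pp
    from sklearn import metrics as ms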
sigmoid function
def sigmoid(z):
    return 1 / (1 + np.exp(-z))   # maps any real z to (0, 1)
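For the binary case (k = 2) the model reduces to a plain sigmoid: with class 0 as the reference, P(y = 1 | x) = exp(theta_1·x̄) / (1 + exp(theta_1·x̄)) = sigmoid(theta_1·x̄). A quick sanity check with made-up numbers (theta_1 and x below are hypothetical):

    theta_1 = np.array([0.5, -0.2, 0.1])   # hypothetical weights, last entry is the bias
    x_bar = np.array([1.0, 2.0, 1.0])      # a feature vector with the constant 1 appended
    p1 = sigmoid(np.dot(theta_1, x_bar))   # P(y = 1 | x) in the two-class case
    p0 = 1 - p1                            # P(y = 0 | x)
    print(p0, p1)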
probability function
input: W, b, X
output: P (n x k array of class probabilities)
def logreg_predict_prob(W, b, X):
    n = len(X)       # number of data points
    k = len(W)       # number of classes
    d = len(X[0])    # dimension of the features
    theta = np.insert(W, d, values=np.ravel(b), axis=1)   # append the bias as an extra column
    X_ba = np.insert(X, d, values=1, axis=1)               # append a constant 1 to every sample
    result = []
    for x in range(0, n):            # iterate over every row of X_ba
        total = 0
        for kk in range(1, k):
            total += np.exp(np.dot(theta[kk], X_ba[x]))
        P0 = 1 / (1 + total)         # class 0 is the reference class
        templist = [P0]
        for j in range(1, k):
            mu1 = np.exp(np.dot(theta[j], X_ba[x]))
            templist.append(mu1 / (1 + total))
        result.append(templist)
    return np.array(result)
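As a side note, the two nested loops can be replaced by a vectorized computation; a sketch that should produce the same n x k probability matrix, assuming W is k x d and b holds the k bias terms:

    def logreg_predict_prob_vec(W, b, X):
        X = np.asarray(X, dtype=float)
        theta = np.hstack([np.asarray(W), np.asarray(b).reshape(-1, 1)])   # k x (d+1)
        X_ba = np.hstack([X, np.ones((len(X), 1))])                        # n x (d+1)
        scores = np.exp(X_ba @ theta.T)                                    # exp(theta_j . x_bar) for every sample
        denom = 1 + scores[:, 1:].sum(axis=1, keepdims=True)               # 1 + sum over classes 1..k-1
        P = scores / denom                                                 # columns 1..k-1 are now correct
        P[:, 0] = 1 / denom[:, 0]                                          # class 0 is the reference class
        return P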
log-likelihood derivative function
input: X, y, c (class index), i (index of the sample), result (the probability matrix P)
output: derivative of the log-likelihood of sample i with respect to theta_c
def log(X, y, c, i, result):
    d = len(X[0])                              # dimension of the features
    X_ba = np.insert(X, d, values=1, axis=1)   # append a constant 1 to every sample
    delta = 1 if y[i] == c else 0              # indicator 1[y_i == c]
    p = result[i, c]                           # predicted P(y_i = c | x_i)
    return (delta - p) * X_ba[i]
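A tiny sanity check with made-up numbers (the feature values and probabilities are hypothetical): for one two-feature sample with true label 1, class c = 1 and a predicted probability of 0.7, the returned term is (1 - 0.7) * [x, 1]:

    X_toy = [[1.0, 2.0]]               # one sample, two features (hypothetical)
    y_toy = [1]                        # its true class
    P_toy = np.array([[0.3, 0.7]])     # pretend predicted probabilities for classes 0 and 1
    print(log(X_toy, y_toy, 1, 0, P_toy))   # -> [0.3, 0.6, 0.3]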
Loss function
input: X,y,theta
output: loss
def loss(X, y, theta):
    n = len(X)       # number of data points
    k = len(theta)   # number of classes
    d = len(X[0])    # dimension of the features
    W = theta[0:k, 0:d]
    b = theta[0:k, d:d+1].T
    result = logreg_predict_prob(W, b, X)
    total = 0
    for i in range(0, n):
        if result[i, y[i]] != 0:       # skip zero probabilities to avoid log(0)
            total += round(float(np.log(result[i, y[i]])), 8)
    return (-1) * total / n            # average negative log-likelihood
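Skipping zero probabilities slightly changes the value of the loss; a common alternative (not what the code above does) is to clip the probabilities before taking the logarithm. A minimal sketch reusing logreg_predict_prob():

    def loss_clipped(X, y, theta, eps=1e-12):
        k = len(theta)
        d = len(X[0])
        W = theta[0:k, 0:d]
        b = theta[0:k, d:d+1].T
        P = logreg_predict_prob(W, b, X)
        P = np.clip(P, eps, 1.0)                            # avoid log(0) instead of skipping terms
        return -np.mean(np.log(P[np.arange(len(X)), y]))    # average negative log-likelihood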
Gradient function
input: batch, X, y, c (class index), theta, eta (learning rate)
output: the update step for theta_c (i.e. -eta times the gradient)
def gradient(batch, X, y, c, theta, eta):   # compute the SGD update step for class c
    n = len(batch)   # batch size
    k = len(theta)   # number of classes
    d = len(X[0])    # dimension of the features
    total = np.zeros(d + 1)
    W = theta[0:k, 0:d]
    b = theta[0:k, d:d+1].T
    result = logreg_predict_prob(W, b, X)
    for i in batch:
        total += log(X, y, c, i, result)
    # the gradient of the loss is -total/n; the returned value is -eta * gradient,
    # so it can be added directly to theta[c]
    return (-1) * eta * ((-1) * total / n)
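As logreg_fit() below shows, the returned value already carries the learning rate and the minus sign, so the parameter update is a plain addition:

    step = gradient(batch, X, y, c, theta, eta)   # = -eta * gradient of the loss w.r.t. theta_c
    theta[c] = theta[c] + step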
function of dividing batches
input: n (number of training data points), m (batch size)
output: batchList (a list of index lists)
the last batch may be smaller than m
def getBatches(n, m):   # split the indices 0..n-1 into shuffled batches of size m (the last one may be smaller)
    batchList = []
    x = list(range(0, n))
    shuffle(x)
    tempList = []
    for y in x:
        if len(tempList) >= m:
            batchList.append(tempList)
            tempList = [y]
            continue
        tempList.append(y)
    batchList.append(tempList)
    return batchList
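A quick check of the batch sizes it produces (the numbers are just an example):

    batches = getBatches(10, 4)
    print([len(b) for b in batches])   # e.g. [4, 4, 2] -- shuffled indices, last batch smaller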
load csv file
input: file name
output: X, y
def load(filename):   # load a CSV file and split it into features and labels
    f = open(filename, 'r')                  # open the file
    print(filename)
    s = f.readlines()                        # read in all lines
    featureList = []
    labelList = []
    for line in s:
        line = line.strip()                  # drop surrounding whitespace and '\n'
        strList = line.split(',')            # split into fields
        for k in range(0, len(strList) - 1):
            strList[k] = float(strList[k])   # features are floats
        strList[-1] = int(strList[-1])       # the last column is the integer label
        label = strList.pop()
        featureList.append(strList)
        labelList.append(label)
    f.close()
    return featureList, labelList
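The expected file layout is one sample per line, feature values first and the integer class label last; a hypothetical three-feature example (file name and values made up):

    # train.csv (hypothetical contents):
    # 5.1,3.5,1.4,0
    # 4.9,3.0,1.3,1
    X_train, y_train = load("train.csv")
    print(X_train[0], y_train[0])   # -> [5.1, 3.5, 1.4] 0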
SGD function
input: X,y,m,eta_start,eta_end,epsilon,max_epoch
output: W,b
def logreg_fit(X, y, m, eta_start, eta_end, epsilon, max_epoch=1000):
    count = 0
    n = len(X)                   # number of data points
    k = len(list(set(y)))        # number of classes
    d = len(X[0])                # dimension of the features
    batchList = getBatches(n, m) # fixed batch split, reused in every epoch
    # initialize theta with small random values
    theta = []
    for kk in range(0, k):
        theta.append(np.random.normal(0, 1, d + 1))
    theta = np.array(theta)
    # theta = np.zeros((k, d+1))   # alternative: zero initialization
    eta = eta_start
    epochList = []
    while count <= max_epoch:
        print('eta', eta)
        epochList.append(loss(X, y, theta))   # record the loss at the start of each epoch
        print('epochList:', epochList)
        print('epochList size', len(epochList))
        count += 1
        theta_old = theta.copy()
        for batch in batchList:
            for c in range(0, k):             # update theta_c for every class
                theta[c] = theta[c] + gradient(batch, X, y, c, theta, eta)
        print('loss', loss(X, y, theta))
        # if the relative improvement over this epoch is too small, shrink the learning rate
        if loss(X, y, theta_old) - loss(X, y, theta) < epsilon * loss(X, y, theta_old):
            eta = eta / 10
            if eta < eta_end:                 # stop once the learning rate is small enough
                W = theta[0:k, 0:d]
                b = theta[0:k, d:d+1]
                print("W", W)
                print("b", b)
                return W, b
    # max_epoch reached
    W = theta[0:k, 0:d]
    b = theta[0:k, d:d+1]
    return W, b
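logreg_predict_class() is called in main() below but its body is not shown in this post; a minimal sketch consistent with the other functions (take the most probable class for every row) could look like this:

    def logreg_predict_class(W, b, X):
        P = logreg_predict_prob(W, b, X)   # n x k matrix of class probabilities
        return np.argmax(P, axis=1)        # predicted class index for every sample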
Training & Testing
def main():
    # load the data
    X_train, y_train = load("train.csv")
    X_train = np.array(X_train)
    y_train = np.array(y_train)
    X_test, y_test = load("test.csv")
    # preprocess: standardize the features
    ss = pp.StandardScaler()
    X_train = ss.fit_transform(X_train)
    X_test = ss.transform(X_test)
    # train
    m = 256
    eta_start = 0.01
    eta_end = 0.00001
    epsilon = 0.0001
    W, b = logreg_fit(X_train, y_train, m, eta_start, eta_end, epsilon, max_epoch=1000)
    # test
    y_pre = logreg_predict_class(W, b, X_test)
    print(ms.accuracy_score(y_test, y_pre))

if __name__ == '__main__':
    main()
output accuracy: 0.8929088277858177
confusion matrix: (figure)
loss decrease over the training epochs: (figure)