LR (logistic regression) model for a binary classification problem, trained with stochastic gradient descent
import numpy as np
import matplotlib.pyplot as plt
import random
import time
def split_data(onelinedata):
    # parse one comma-separated line into a list of floats
    onelinedata = onelinedata.split(",")
    x = list(map(float, onelinedata))
    # x = [float(item) for item in onelinedata]
    # x = [str2float(item) for item in onelinedata]
    return x
def make_train_data(path, chunk_size):
    # read roughly chunk_size bytes of lines and parse them into a float16 array
    with open(path) as f:
        chunk_data = f.readlines(chunk_size)
    x = np.array([split_data(item) for item in chunk_data], dtype=np.float16)
    return x
def make_test_data(path, chunk_size):
    with open(path) as f:
        chunk_data = f.readlines(chunk_size)
    x = np.array([split_data(item) for item in chunk_data], dtype=np.float16)
    return x
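# A minimal, hypothetical sanity check of the loaders above. Each input line is assumed to be
# comma-separated floats; for the training file the last column is the 0/1 label. The file name
# "./toy_train.txt" and its two rows are made up for illustration and are not the real dataset;
# the helper is never called during the actual run.
def _demo_loaders():
    with open("./toy_train.txt", "w") as f:
        f.write("0.5,1.2,0.3,1\n")
        f.write("0.1,0.4,0.9,0\n")
    toy_xy = make_train_data("./toy_train.txt", chunk_size=1024)
    print(toy_xy.shape)  # expected: (2, 4)
# _demo_loaders()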
#begin_time=time.time()
train_path = "./train_data.txt"
test_path1 = "./test_data.txt"
test_path2 = "./answer.txt"
train_chunk_size = 10000 # MB
test_chunk_size = 100 # MB
train_xy = make_train_data(train_path, chunk_size=train_chunk_size*1024*1024)
train_x = train_xy[:, :-1]  # input features
train_y = train_xy[:, -1]   # output labels (last column)
test_x = make_test_data(test_path1, chunk_size=test_chunk_size*1024*1024)
test_y = make_test_data(test_path2, chunk_size=test_chunk_size*1024*1024)
#print(train_x.shape)
#print(train_y.shape)
#print(train_xy.shape)
#print(test_x.shape)
#print(test_y.shape)
def sigmoid(inx):
    if inx >= 0:  # piecewise form of sigmoid avoids overflow in exp() for large |inx|
        return 1.0/(1+np.exp(-inx))
    else:
        return np.exp(inx)/(1+np.exp(inx))
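# Why the branch above matters: for very negative inx, exp(-inx) overflows, while the
# algebraically identical form exp(inx)/(1+exp(inx)) stays bounded. A vectorized sketch of the
# same idea for array input (a hypothetical helper, not used by the training loop below):
def sigmoid_stable(x):
    x = np.asarray(x, dtype=np.float64)
    out = np.empty_like(x)
    pos = x >= 0
    out[pos] = 1.0/(1.0+np.exp(-x[pos]))                # safe where x >= 0
    out[~pos] = np.exp(x[~pos])/(1.0+np.exp(x[~pos]))   # safe where x < 0
    return out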
def gradAscnet(input, output, test_x, test_y):
    # stochastic gradient ascent for logistic regression (per-sample weight updates)
    dataMatrix = np.mat(input)
    labelMatrix = np.mat(output)
    m, n = input.shape
    maxCycles = 100
    weights = np.ones((n, 1))
    # temp_y = np.zeros(test_y.shape)
    for j in range(maxCycles):
        dataIndex = list(range(m))
        errorsum = np.zeros((1, 1))
        for i in range(m):
            alpha = 4/(1.0+j+i)+0.01                      # step size decays over iterations
            randIndex = int(random.uniform(0, len(dataIndex)))
            sampleIndex = dataIndex[randIndex]            # pick a not-yet-used sample this epoch
            h = sigmoid(dataMatrix[sampleIndex, :]*weights)
            # print(labelMatrix[0, sampleIndex])
            error = labelMatrix[0, sampleIndex]-h
            weights = weights+alpha*error[0, 0]*(dataMatrix[sampleIndex, :].transpose())
            errorsum = errorsum+error
            del(dataIndex[randIndex])
        # print(weights[0:4, 0].transpose(), errorsum)
        # pred_y = predict(test_x, weights)
        # pred_y = np.round(pred_y)
        # print((pred_y.size - sum(abs(pred_y - test_y))) / pred_y.size)
        # #print((pred_y.size - sum(abs(pred_y - temp_y))) / pred_y.size)
        # print(" ")
        # temp_y = pred_y
    return weights
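# The inner update above is the per-sample gradient-ascent step on the logistic log-likelihood,
# w <- w + alpha*(y - sigmoid(x·w))*x, with a step size alpha that decays as training proceeds.
# A small self-contained sanity check on synthetic, linearly separable data (illustrative only:
# the data is randomly generated, not the real dataset, and the helper is never called here):
def _demo_sgd_sanity_check(n_samples=200, n_features=3, seed=0):
    rng = np.random.RandomState(seed)
    true_w = rng.randn(n_features, 1)
    X = rng.randn(n_samples, n_features)
    y = (X.dot(true_w).ravel() > 0).astype(np.float64)   # labels from a known linear rule
    w = gradAscnet(X, y, X, y)
    acc = np.mean(np.round(np.array(predict(X, w)).ravel()) == y)
    print("sanity-check accuracy:", acc)                  # should typically be close to 1.0
# _demo_sgd_sanity_check()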
def predict(input, w):
    # probability estimates: sigmoid of each row's score input·w
    dataMatrix = np.mat(input)
    weights = np.mat(w)
    output = dataMatrix*weights
    for i in range(output.size):
        output[i, 0] = sigmoid(output[i, 0])
    return output
weights = gradAscnet(train_x, train_y, test_x, test_y)
pred_y = predict(test_x, weights)
pred_y = np.round(pred_y)  # threshold the probabilities at 0.5
print((pred_y.size-np.sum(np.abs(pred_y-test_y)))/pred_y.size)  # accuracy rate
#end_time = time.time()
#print(end_time-begin_time)  # seconds
np.savetxt("./result.txt", pred_y, fmt='%d', delimiter='\n')