训练数据以及源文件(Python 2.x)见作者(赵志勇)的 GitHub:
https://github.com/zhaozhiyong19890102/Python-Machine-Learning-Algorithm
以下文件已经过修改,适用于 Python 3.x。
1. 训练文件 train.py
# coding:UTF-8
import numpy as np
def sig(x):
    """Apply the logistic sigmoid function element-wise.

    :param x: scalar, ndarray or matrix of real values
    :return: ``1 / (1 + exp(-x))``, with the same shape as ``x``
    """
    # The float literal keeps the division in floating point.
    return 1.0 / (1 + np.exp(-x))
def error_rate(h, label):
    """Compute the mean cross-entropy (log-loss) of the predictions.

    For every sample the contribution is
    ``-(y * log(p) + (1 - y) * log(1 - p))``; the sum is divided by the
    total number of samples.

    :param h: (m, 1) matrix of predicted probabilities
    :param label: (m, 1) matrix of true labels
    :return: average loss as a float; 0.0 when there are no samples
    """
    m = np.shape(h)[0]  # number of samples (rows of h)
    if m == 0:
        return 0.0
    sum_err = 0.0
    for i in range(m):
        p = h[i, 0]
        y = label[i, 0]
        # log() is undefined at 0, so saturated predictions (p == 0 or
        # p == 1) are skipped and contribute nothing to the sum.
        if 0 < p < 1:
            sum_err -= y * np.log(p) + (1 - y) * np.log(1 - p)
    return sum_err / m
def lr_train_bgd(feature, label, maxCycle, alpha):
    """Train logistic-regression weights with batch gradient descent.

    :param feature: (m, n) matrix, one sample per row (m samples,
        n dimensions); must support ``*`` as a matrix product
    :param label: (m, 1) matrix of labels
    :param maxCycle: number of gradient-descent updates to perform
    :param alpha: learning rate (step size)
    :return: (n, 1) matrix of learned weights
    """
    n = np.shape(feature)[1]  # number of columns = weight dimension
    w = np.mat(np.ones((n, 1)))  # weights start as a column of ones
    # Exactly maxCycle updates, numbered 1..maxCycle.
    for i in range(1, maxCycle + 1):
        h = sig(feature * w)  # (m, n) * (n, 1) -> (m, 1) predictions
        err = label - h  # (m, 1) residuals
        if i % 100 == 0:
            print("\t--------iter=" + str(i) +
                  ", train error rate= " + str(error_rate(h, label)))
        # Batch update in matrix form: (n, m) * (m, 1) -> (n, 1).
        w = w + alpha * feature.T * err
    return w
def load_data(file_name):
    """Load tab-separated training data.

    Every non-blank line holds the feature values followed by the label
    in the last column. A constant 1 is prepended to each feature row
    as the bias term x0.

    :param file_name: path of the training data file
    :return: tuple ``(feature_data, label_data)`` of matrices; one row
        per sample, and the label matrix has a single column
    :raises ValueError: if a field cannot be parsed as a float
    """
    feature_data = []
    label_data = []
    # The context manager closes the file even if parsing fails.
    with open(file_name) as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline) carry no sample.
                continue
            fields = stripped.split("\t")
            # x0 = 1 followed by every column except the last one.
            feature_data.append([1] + [float(v) for v in fields[:-1]])
            # The last column is the label.
            label_data.append([float(fields[-1])])
    return np.mat(feature_data), np.mat(label_data)
def save_model(file_name, w):
    """Write the weights to a text file as one tab-separated line.

    :param file_name: output path; an existing file is overwritten
    :param w: 2-D weight matrix; only column 0 of each row is written
    """
    m = np.shape(w)[0]
    w_array = [str(w[i, 0]) for i in range(m)]
    # The context manager closes the file even if the write fails.
    with open(file_name, "w") as f_w:
        f_w.write("\t".join(w_array))
if __name__ == "__main__":
    # Step 1: read the training samples and labels.
    print("---------- 1.load data ------------")
    feature, label = load_data("data.txt")
    # Step 2: fit the weights (1000 iterations, learning rate 0.01).
    print("---------- 2.training ------------")
    w = lr_train_bgd(feature, label, 1000, 0.01)
    # Step 3: persist the learned weights.
    print("---------- 3.save model ------------")
    save_model("weights", w)
2. 测试文件 test.py
# coding:UTF-8
import numpy as np
from logistic_regression import sig
def load_weight(w):
    """Load model weights from a tab-separated text file.

    :param w: path of the weights file
    :return: matrix with one row per non-blank line of the file
    :raises ValueError: if a field cannot be parsed as a float
    """
    weights = []
    # The context manager closes the file even if parsing fails.
    with open(w) as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Skip blank lines instead of failing on float("").
                continue
            weights.append([float(x) for x in stripped.split("\t")])
    return np.mat(weights)
def load_data(file_name, n):
    """Load tab-separated test samples (features only, no label).

    A constant 1 is prepended to each row as the bias term, so a line
    must contain exactly ``n - 1`` fields; other lines are skipped.

    :param file_name: path of the test data file
    :param n: expected number of features per sample including the bias
    :return: matrix with one row of ``n`` values per accepted line
    :raises ValueError: if a field cannot be parsed as a float
    """
    feature_data = []
    # The context manager closes the file even if parsing fails.
    with open(file_name) as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                continue  # blank line: nothing to parse
            fields = stripped.split("\t")
            if len(fields) != n - 1:
                continue  # wrong number of columns: ignore the line
            feature_data.append([1] + [float(x) for x in fields])
    return np.mat(feature_data)
def predict(data, w):
    """Classify samples with the trained weights.

    :param data: matrix of samples, one per row
    :param w: weight matrix; it is transposed before the product, so it
        is presumably a single row of weights -- confirm against caller
    :return: matrix of sigmoid outputs thresholded at 0.5, holding 0.0
        or 1.0 in column 0 of every row
    """
    h = sig(data * w.T)
    m = np.shape(h)[0]
    for i in range(m):
        # Probabilities below 0.5 map to class 0, everything else to 1.
        if h[i, 0] < 0.5:
            h[i, 0] = 0.0
        else:
            h[i, 0] = 1.0
    return h
def save_result(file_name, result):
    """Write the predictions to a text file as one tab-separated line.

    :param file_name: output path; an existing file is overwritten
    :param result: 2-D matrix; only column 0 of each row is written
    """
    m = np.shape(result)[0]
    tmp = [str(result[i, 0]) for i in range(m)]
    # The context manager closes the file even if the write fails.
    with open(file_name, "w") as f_result:
        f_result.write("\t".join(tmp))
if __name__ == "__main__":
    # Step 1: load the trained weights; the column count gives the
    # number of features (including the bias term).
    print("---------- 1.load model ------------")
    w = load_weight("weights")
    n = np.shape(w)[1]
    # Step 2: load the test samples that match that feature count.
    print ("---------- 2.load data ------------")
    testData = load_data("test_data", n)
    # Step 3: classify every test sample.
    print ("---------- 3.get prediction ------------")
    h = predict(testData, w)
    # Step 4: write the predictions to disk.
    print ("---------- 4.save prediction ------------")
    save_result("result2", h)