代码、数据已经上传,可以自主下载。https://download.csdn.net/download/shenziheng1/10719760
1. 训练过程
import numpy as np
def load_data(file_name):
    """Load tab-separated training samples from a text file.

    Every non-blank line holds the feature values followed by the label in
    the last column. A constant 1 is prepended to each feature row (x0 = 1)
    so that the first weight can act as the bias term.

    input:  file_name(string) path of the training data file
    output: feature_data(mat) one row per sample, first column is all ones
            label_data(mat)   one row per sample, a single label column
    raises: ValueError if a field cannot be parsed as a float
    """
    feature_data = []
    label_data = []
    # `with` closes the handle even when a malformed line makes float() raise.
    with open(file_name, "r") as f:
        for line in f:
            fields = line.strip().split("\t")
            if fields == [""]:
                # Blank line (typically a trailing newline at the end of the
                # file): skip it instead of failing on float("").
                continue
            # x0 = 1, followed by every column except the last (the label).
            feature_data.append([1] + [float(v) for v in fields[:-1]])
            label_data.append([float(fields[-1])])
    # np.asmatrix instead of np.mat: the `mat` alias was removed in NumPy 2.0.
    return np.asmatrix(feature_data), np.asmatrix(label_data)
def sig(x):
    """Sigmoid (logistic) function, applied element-wise.

    input:  x(number / array / mat)
    output: 1 / (1 + exp(-x)), computed element-wise
    """
    return 1.0 / (1 + np.exp(-x))
def error_rate(h, label):
    """Mean cross-entropy (log) loss of predictions against labels.

    Despite its name this is not a misclassification ratio: it accumulates
    -(y * log(p) + (1 - y) * log(1 - p)) over the samples and divides by the
    number of rows of h.

    Samples whose prediction is not strictly inside (0, 1) are left out of
    the sum, because the logarithm would not be finite there, but they are
    still counted in the divisor.

    input:  h(mat)     predictions, one row per sample (first column is used)
            label(mat) labels, one row per sample (first column is used)
    output: sum_err / m(float) mean loss
    raises: ZeroDivisionError if h has no rows
    """
    m = np.shape(h)[0]
    sum_err = 0.0
    for i in range(m):
        p = h[i, 0]
        y = label[i, 0]
        # log(p) and log(1 - p) are only finite for 0 < p < 1.
        if p > 0 and (1 - p) > 0:
            sum_err -= y * np.log(p) + (1 - y) * np.log(1 - p)
    return sum_err / m
def lr_train_bgd(feature, label, maxCycle, alpha):
    """Train logistic-regression weights with batch gradient descent.

    Starting from an all-ones weight vector, each iteration computes the
    predictions h = sig(feature * w) on the whole data set and applies
    w <- w + alpha * feature.T * (label - h). Exactly maxCycle updates are
    performed; every 100th iteration the current training loss is printed.

    input:  feature(mat)  samples, one row per sample
            label(mat)    labels, one row per sample
            maxCycle(int) number of gradient-descent updates
            alpha(float)  learning rate
    output: w(mat) weights, one row per feature column
    """
    n = np.shape(feature)[1]  # the number of features
    # np.asmatrix instead of np.mat: the `mat` alias was removed in NumPy 2.0.
    w = np.asmatrix(np.ones((n, 1)))  # one weight per feature
    # Iterations are numbered 1..maxCycle so that maxCycle == 0 leaves the
    # initial weights untouched and no extra update is performed.
    for i in range(1, maxCycle + 1):
        h = sig(feature * w)
        err = label - h
        if i % 100 == 0:
            # The loss is reported for the weights *before* this update.
            print("\t--------iter=" + str(i) +
                  ", train error rate=" + str(error_rate(h, label)))
        w = w + alpha * feature.T * err  # modifying weights
    return w
def save_model(file_name, w):
    """Write the weights to a text file as one tab-separated line.

    Only the first column of w is written, one value per row of w, and no
    trailing newline is added.

    input:  file_name(string) the filepath for saving the model
            w(mat)            weights, one row per weight
    """
    m = np.shape(w)[0]
    w_array = [str(w[i, 0]) for i in range(m)]
    # `with` closes (and flushes) the file even if the write fails.
    with open(file_name, "w") as f_w:
        f_w.write("\t".join(w_array))
#def imgplot(feature, w):
if __name__ == "__main__":
    # Script entry point: load the training set, fit the model, store it.

    # import training data
    print("--------load data--------")
    feature, label = load_data("data.txt")

    # training logistic regression model (1000 iterations, step size 0.01)
    print("--------training--------")
    w = lr_train_bgd(feature, label, 1000, 0.01)

    # save model
    print("--------save model---------")
    save_model("weights", w)
训练结果为:
2. 测试代码
import numpy as np
from logistic_training import sig
def load_weight(w):
    """Load model weights from a tab-separated text file.

    Every non-blank line of the file becomes one row of the result, so a
    file holding all weights on a single line yields a matrix with one row.

    input:  w(string) path of the weights file
    output: weights(mat) one row per non-blank line of the file
    raises: ValueError if a field cannot be parsed as a float
    """
    rows = []
    # The parameter keeps its original name for callers, but it is no longer
    # rebound to the result list; `with` closes the handle on errors too.
    with open(w) as f:
        for line in f:
            fields = line.strip().split("\t")
            if fields == [""]:
                # Blank line: skip instead of failing on float("").
                continue
            rows.append([float(x) for x in fields])
    # np.asmatrix instead of np.mat: the `mat` alias was removed in NumPy 2.0.
    return np.asmatrix(rows)
def load_data(file_name, n):
    """Load tab-separated, unlabeled samples for prediction.

    A constant 1 is prepended to every row (x0 = 1), so each returned row
    has n columns. Blank lines and lines that do not contain exactly n - 1
    fields are skipped silently.

    input:  file_name(string) path of the data file
            n(int)            number of weights, i.e. feature count + 1
    output: feature_data(mat) one row per accepted line
    raises: ValueError if a field cannot be parsed as a float
    """
    feature_data = []
    # `with` closes the handle even when float() raises on a bad field.
    with open(file_name) as f:
        for line in f:
            fields = line.strip().split("\t")
            # The explicit blank-line test matters when n == 2: an empty
            # line splits into [""], which would pass the width check and
            # then fail on float("").
            if fields == [""] or len(fields) != n - 1:
                continue
            feature_data.append([1] + [float(x) for x in fields])
    # np.asmatrix instead of np.mat: the `mat` alias was removed in NumPy 2.0.
    return np.asmatrix(feature_data)
def predict(data, w):
    """Classify every sample as 0.0 or 1.0 with the trained weights.

    The score sig(data * w.T) is thresholded at 0.5: scores below 0.5 map
    to 0.0, everything else maps to 1.0.

    input:  data(mat) samples, one row per sample
            w(mat)    weights as a single row (hence the transpose)
    output: h(mat) predicted class per sample, values 0.0 or 1.0
    """
    h = sig(data * w.T)
    # Vectorised thresholding; np.where returns a plain array, so wrap the
    # result to keep handing a matrix back to the caller.
    return np.asmatrix(np.where(h < 0.5, 0.0, 1.0))
def save_result(file_name, result):
    """Write the predictions to a text file as one tab-separated line.

    Only the first column of result is written, one value per row, and no
    trailing newline is added.

    input:  file_name(string) the filepath for saving the predictions
            result(mat)       predictions, one row per sample
    """
    m = np.shape(result)[0]
    tmp = [str(result[i, 0]) for i in range(m)]
    # `with` closes (and flushes) the file even if the write fails.
    with open(file_name, "w") as f_result:
        f_result.write("\t".join(tmp))
if __name__ == "__main__":
    # Script entry point: load the stored model, classify the test set and
    # write the predictions to disk.

    # loading LR model
    print("--------load model---------")
    w = load_weight("weights")
    n = np.shape(w)[1]  # number of weights = expected columns per sample

    # loading testing data
    testData = load_data("test_data", n)

    # predicting test data
    print("--------prediction--------")
    h = predict(testData, w)
    print(h)

    # save prediction results
    print("--------save prediction--------")
    save_result("results", h)
3. 补充知识
- readlines(): 用于读取所有行(直到结束符 EOF)并返回一个列表,该列表可以由 Python 的 for... in ... 结构进行处理。如果已经到达结束符 EOF,则返回空列表。
- strip():用于移除字符串头尾指定的字符(默认为空白字符,如空格、制表符和换行符)或字符序列。注意:该方法只能删除开头或是结尾的字符,不能删除中间部分的字符。
# Example: strip() only removes the given characters from both ends.
# The variable is not called `str`, to avoid shadowing the builtin type.
text = "00000003210Runoob01230000000"
print(text.strip('0'))  # strip the leading and trailing '0' characters
>>> 3210Runoob0123
- join(): 用于将序列中的元素以指定的字符连接生成一个新的字符串
# Example: join() concatenates the items of a sequence with a separator.
# The variable is not called `str`, to avoid shadowing the builtin type.
sep = "-"
seq = ("a", "b", "c")  # sequence of strings
print(sep.join(seq))
>>> a-b-c