Softmax Regression python实现。
权重训练文件:
import numpy as np
import time
# file_path1 = open("./Data.txt", "r")
file_path1 = 'Data.txt'
def load_dataset(fileinput):
    '''Load training samples from a whitespace-separated text file.

    Each line holds the feature values followed by the class label in the
    last column.  A constant 1 is prepended to every sample as the bias term.

    input:  fileinput(string) path of the data file
    output: feature_data(mat) one training sample per row (bias column first)
            label_data(mat)   labels as a column vector
            k(int)            number of distinct classes
    '''
    feature_data = []
    label_data = []
    # "with" guarantees the file is closed even if a line fails to parse
    # (the original left the handle open on error).
    with open(fileinput, "r") as f:
        for line in f:
            lines = line.strip().split()
            # bias term 1, then every column except the last (the label)
            feature_data.append([1] + [float(v) for v in lines[:-1]])
            label_data.append(float(lines[-1]))
    return np.mat(feature_data), np.mat(label_data).T, len(set(label_data))
def cost(err, label_data):
    '''Mean cross-entropy cost of the softmax model.

    input:  err(mat)        exp(scores): one row per sample, one column per class
            label_data(mat) true class indices as a column vector (0-based here)
    output: sum_cost / m(float) mean negative log-likelihood
    '''
    m = np.shape(err)[0]
    sum_cost = 0.0
    for i in range(m):
        # probability the model assigns to the true class of sample i
        # (bug fix: the original indexed with the raw float label on the
        # np.log line, which raises an IndexError under NumPy)
        p = err[i, int(label_data[i, 0])] / np.sum(err[i, :])
        # guard against log(0); non-positive "probabilities" contribute nothing
        if p > 0:
            sum_cost -= np.log(p)
    return sum_cost / m
def gradientAscent(feature_data, label_data, k, maxCycle, alpha):
    '''Train a Softmax model by gradient ascent on the log-likelihood.

    input:  feature_data(mat) sample features, one row per sample
            label_data(mat)   class labels as a column vector (1-based here)
            k(int)            number of classes
            maxCycle(int)     maximum number of iterations
            alpha(float)      learning rate
    output: weights(mat)      weight matrix, one column per class
    '''
    m, n = np.shape(feature_data)
    weights = np.mat(np.ones((n, k)))
    # the loop deliberately runs maxCycle + 1 times (i went 0..maxCycle)
    for _ in range(maxCycle + 1):
        scores = np.exp(feature_data * weights)
        # dividing by the NEGATED row sums turns each entry into
        # -p(class | sample), so the update below is (indicator - p)
        denom = (-scores.sum(axis=1)).repeat(k, axis=1)
        grad = scores / denom
        # add the one-hot indicator of each sample's true class
        for row in range(m):
            grad[row, int(label_data[row, 0]) - 1] += 1
        weights = weights + (alpha / m) * feature_data.T * grad
    return weights
def save_model(file_name, weights):
    '''Persist the trained weight matrix as tab-separated text, one row per line.

    input:  file_name(string) output file name
            weights(mat)      softmax weight matrix
    '''
    m, n = np.shape(weights)
    # "with" closes the file even if formatting raises mid-way
    with open(file_name, "w") as f_w:
        for i in range(m):
            row = [str(weights[i, j]) for j in range(n)]
            f_w.write("\t".join(row) + "\n")
def test():
    '''Load the data, train the softmax weights, and save them to "weights".'''
    stamp = lambda: time.strftime("%a %b %d %H:%M:%S %Y", time.localtime())
    features, labels, num_classes = load_dataset(file_path1)
    print('开始训练模型权重', stamp())
    trained = gradientAscent(features, labels, num_classes, 1000, 0.8)
    print('结束训练模型权重', stamp())
    print('保存模型权重文件:weights')
    save_model("weights", trained)
if __name__ == "__main__":
test()
测试训练权重文件:
import numpy as np
import random as rd
def load_weights(weights_path):
    '''Load a trained Softmax weight matrix from a tab-separated text file.

    input:  weights_path(string) location of the stored weights
    output: weights(mat) the weights as a matrix
            m(int)       number of rows of the weight matrix
            n(int)       number of columns of the weight matrix
    '''
    w = []
    # "with" guarantees the file is closed even if a row fails to parse
    with open(weights_path) as f:
        for line in f:
            w.append([float(x) for x in line.strip().split("\t")])
    weights = np.mat(w)
    m, n = np.shape(weights)
    return weights, m, n
def load_dataset(fileinput):
    '''Load test samples from a whitespace-separated text file.

    Each line holds the feature values followed by the class label in the
    last column.  A constant 1 is prepended to every sample as the bias term.

    input:  fileinput(string) path of the data file
    output: feature_data(mat) one sample per row (bias column first)
            label_data(mat)   labels as a column vector
            k(int)            number of distinct classes
    '''
    feature_data = []
    label_data = []
    # "with" guarantees the file is closed even if a line fails to parse
    with open(fileinput) as f:
        for line in f:
            lines = line.strip().split()
            # bias term 1, then every column except the last (the label)
            feature_data.append([1] + [float(v) for v in lines[:-1]])
            label_data.append(float(lines[-1]))
    return np.mat(feature_data), np.mat(label_data).T, len(set(label_data))
def load_data(num, m):
    '''Generate random test samples (bias column kept at 1).

    input:  num(int) number of samples to generate
            m(int)   dimensionality of each sample
    output: testDataSet(mat) the generated samples
    '''
    samples = np.mat(np.ones((num, m)))
    print(samples)
    for row in range(num):
        # column 1: uniform in [-3, 3); column 2: uniform in [0, 15)
        samples[row, 1] = rd.random() * 6 - 3
        samples[row, 2] = rd.random() * 15
    print(samples)
    return samples
def predict(test_data, weights):
    '''Classify test samples with a trained Softmax model.

    input:  test_data(mat) test sample features, one row per sample
            weights(mat)   trained weight matrix, one column per class
    output: column vector of 0-based indices of the highest-scoring class
    '''
    # arg-max of the linear scores equals arg-max of the softmax probabilities
    scores = test_data * weights
    return scores.argmax(axis=1)
def save_result(file_name, result):
    '''Save the final predictions, one value per line.

    input:  file_name(string) output file name
            result(mat)       column vector of predicted labels
    '''
    m = np.shape(result)[0]
    # "with" closes the file even if an entry fails to stringify
    with open(file_name, "w") as f_result:
        for i in range(m):
            f_result.write(str(result[i, 0]) + "\n")
def findamenum(num1, num2):
    '''Fraction of positions where two equal-length sequences agree (accuracy).

    input:  num1, num2 sequences of the same length (true labels, predictions)
    output: matches / len(num1) (float) in [0, 1]
    raises: ZeroDivisionError on empty input (unchanged from the original)
    '''
    # the original also tallied mismatches into an unused counter; dropped
    matches = sum(1 for a, b in zip(num1, num2) if a == b)
    return matches / len(num1)
if __name__ == "__main__":
    # 1. Load the trained Softmax model from the "weights" file
    print("---------- 1.load model ------------")
    w, m, n = load_weights("weights")
    # 2. Load the test data (features, labels, class count)
    print("---------- 2.load data ------------")
    file_path1 = 'test_dataset.txt'
    x, y, z = load_dataset(file_path1)
    # test_data = load_data(4000, m)
    # 3. Predict on the test data with the trained model
    print("---------- 3.get Prediction ------------")
    # +1 converts argmax's 0-based class index back to the 1-based labels
    # that gradientAscent was trained with
    result = predict(x, w) + 1
    # 4. Save the final predictions to the "result" file
    print("---------- 4.save prediction ------------")
    save_result("result", result)
    # the test set holds 79 samples
    #
    # NOTE(review): findamenum returns accuracy, although the printed label
    # says "预测概率" (prediction probability)
    print("预测概率")
    print(findamenum(y, result))
数据集:训练集,测试集。
数据集分布文件:
import numpy as np
# import matplotlib.pyplot as plt
# from matplotlib.font_manager import FontProperties
# Load the raw dataset; gen_dataset below reads the module-level `a`.
# Presumably each row is: x feature, y feature, class label — TODO confirm
# against Data.txt.
a = np.loadtxt('Data.txt')
# NOTE(review): b (the first two columns) is computed but never used
b = a[:, 0: 2]
print(a)
def gen_dataset():
    '''Scatter-plot the dataset: the two feature columns on the axes,
    the class label (third column) as the point colour.'''
    import matplotlib.pyplot as pltt
    # 6x4-inch figure
    pltt.figure(figsize=(6, 4))
    # colour each point by its class label
    pltt.scatter(a[:, 0], a[:, 1], c=a[:, 2])
    pltt.title("20208223056")
    pltt.xlabel("x feature")
    pltt.ylabel("y feature")
    pltt.show()
######
if __name__ == "__main__":
gen_dataset()
数据集下载,提取码:wlo3(原下载链接在转载时丢失)