鸢尾花数据集用logistic多分类训练
logistic 多分类采用一对多(one-vs-rest)策略:有几个类别就训练几个二分类器,每个分类器对应一类,把样本分为"属于本类"(1)和"不属于本类"(0)。
训练用梯度下降法,核心就两点:sigmoid 函数和 logistic 损失函数对参数的求导。不难。
代码
from sklearn import datasets
from sklearn.model_selection import GridSearchCV,train_test_split,cross_val_score
import numpy
# Load the iris dataset: 150 samples, 4 features, 3 classes.
iris = datasets.load_iris()
data_x = iris["data"]
# Prepend a constant 1 to every feature row so theta0 can act as the bias
# term (vectorized instead of a per-row Python loop).
data_x = numpy.hstack([numpy.ones((data_x.shape[0], 1)), data_x])
# print("data_x", data_x)
data_y = iris["target"]
# Split into 70% training / 30% test data.
x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.3)
# print(x_train[0])
# Initial parameter vector: index 0 is the bias, indices 1..4 weight the
# four iris features.  Every parameter starts at 1.
theta_list = numpy.array([1, 1, 1, 1, 1])
y = 3  # total number of classes
# One-vs-rest: build one binary label vector per class
# (1 = sample belongs to this class, 0 = it does not).
y_class_list = []
for i in range(y):
    # numpy.int was deprecated in NumPy 1.20 and removed in 1.24 --
    # use the builtin int instead.
    y_class_list.append((y_train == i).astype(int))
    # print(y_class_list[-1])
def cost(array_x, y, j, theta_list_yi):
    """Partial derivative of the logistic loss for a single sample.

    :param array_x: feature vector of one sample (with the bias 1 prepended)
    :param y: binary label (1 = belongs to this classifier's class, else 0)
    :param j: index of the parameter being updated
    :param theta_list_yi: current parameter vector
    :return: (h(x) - y) * x[j], the j-th gradient component for this sample
    """
    z = theta_list_yi.dot(array_x)
    # sigmoid(theta . x); numpy.exp is the idiomatic (and more accurate)
    # spelling of numpy.e ** z.
    h = 1.0 / (1.0 + numpy.exp(-z))
    # Renamed the result variable: the original shadowed the function name
    # with a local also called `cost`.
    return (h - y) * array_x[j]
def renew(j, learn_possibility, y_class, theta_list_yi):
    """Return the updated value of parameter j after one batch-gradient step.

    :param j: index of the parameter to update
    :param learn_possibility: learning rate
    :param y_class: which one-vs-rest classifier this is (selects the binary
        label vector in the global ``y_class_list``)
    :param theta_list_yi: current parameter vector
    :return: new value for ``theta_list_yi[j]``
    """
    m = len(x_train)
    labels = y_class_list[y_class]
    # Full batch gradient: sum the per-sample gradient over the whole
    # training set (the original accumulated this in an explicit loop).
    grad_sum = sum(cost(x_train[i], labels[i], j, theta_list_yi) for i in range(m))
    # The 0.99 factor is a crude L2 regularisation (weight decay).
    # NOTE(review): it also shrinks the bias theta0, which is normally
    # exempt from regularisation -- confirm this is intended.
    return 0.99 * theta_list_yi[j] - learn_possibility * grad_sum / m
renew_num = 100  # number of gradient-descent updates per classifier
learning = 0.03  # learning rate
theta_final_list = []  # after training: one parameter vector per classifier
for class_i in range(y):
    theta_list_yi = theta_list  # every classifier starts from the all-ones parameters
    # print("theta_list_yi", theta_list_yi)
    for renew_i in range(renew_num):
        # Build the whole new parameter vector from the OLD one before
        # replacing it, so a partially-updated vector is never used.
        # range(len(...)) instead of the original hard-coded range(5).
        new_theta_list = [
            renew(j, learning, class_i, theta_list_yi)
            for j in range(len(theta_list_yi))
        ]
        theta_list_yi = numpy.array(new_theta_list)
    theta_final_list.append(theta_list_yi)
for final_theta in theta_final_list:
    print(final_theta)  # the three one-vs-rest classifiers
# For every test sample, score it with each classifier and predict the
# class whose classifier outputs the highest sigmoid probability.
final_y_predict_list = []
for sample in x_test:
    class_probs = []
    for class_i in range(y):
        z = theta_final_list[class_i].dot(sample)
        class_probs.append(1 / (1 + numpy.exp(-z)))  # sigmoid score
    final_y_predict_list.append(class_probs.index(max(class_probs)))
print(final_y_predict_list)
print(list(y_test))
# Accuracy: fraction of test samples whose prediction matches the label.
predict_right_num = sum(
    1 for pred, true in zip(final_y_predict_list, y_test) if pred == true
)
probability = predict_right_num / len(final_y_predict_list)
print("预测正确概率", probability)
三连嘻嘻嘻。