Overview
Logistic regression builds on linear regression by passing its output through an activation function, the sigmoid function, which turns the regression problem into a 0-1 classification problem, covering binary questions such as whether an email is spam or whether a melon is a good one.
The sigmoid function
The sigmoid function squashes any real number into the interval $(0, 1)$:
$\sigma(z) = \dfrac{1}{1+e^{-z}}.$
Applied to the linear model's output $z = \boldsymbol{w}^{\mathrm{T}}\boldsymbol{x} + b$, it produces a value that can be read as the probability of the positive class.
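A quick numeric check of this formula in NumPy (the sigmoid helper below is illustrative, not part of the implementation later in this post):

import numpy as np

def sigmoid(z):
    # 1 / (1 + e^{-z}), computed elementwise
    return 1 / (1 + np.exp(-z))

print(sigmoid(np.array([-5.0, 0.0, 5.0])))  # ~[0.0067, 0.5, 0.9933]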
Maximum likelihood
Given a training set $\{(\boldsymbol{x}_i, y_i)\}_{i=1}^{m}$ with $y_i \in \{0, 1\}$, the parameters $\boldsymbol{w}$ and $b$ are estimated by maximum likelihood: choose them to maximize the log-likelihood
$\ell(\boldsymbol{w}, b) = \sum_{i=1}^{m} \ln p(y_i \mid \boldsymbol{x}_i; \boldsymbol{w}, b),$
i.e. to make each observed label as probable as possible under the model.
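To make the objective concrete, here is a minimal sketch that evaluates this log-likelihood for one candidate w and b on toy 0/1-labeled data (all names and numbers are illustrative):

import numpy as np

X = np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 1.0], [0.0, 0.0]])  # 4 samples, 2 features
y = np.array([1, 0, 1, 0])                                      # labels in {0, 1}
w, b = np.array([1.0, 1.0]), -1.0

p1 = 1 / (1 + np.exp(-(X @ w + b)))   # p(y=1 | x) for each sample
log_lik = np.sum(y * np.log(p1) + (1 - y) * np.log(1 - p1))
print(log_lik)  # the MLE chooses the w, b that maximize this value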
Log odds
If $y$ is interpreted as the probability of the positive class, then $\frac{y}{1-y}$ is the odds and $\ln\frac{y}{1-y}$ the log odds (logit). Logistic regression models the log odds as a linear function of the input:
$\ln\dfrac{y}{1-y} = \boldsymbol{w}^{\mathrm{T}}\boldsymbol{x} + b.$
Therefore:
$p(y=1 \mid \boldsymbol{x}) = \dfrac{e^{\boldsymbol{w}^{\mathrm{T}}\boldsymbol{x}+b}}{1+e^{\boldsymbol{w}^{\mathrm{T}}\boldsymbol{x}+b}}, \qquad p(y=0 \mid \boldsymbol{x}) = \dfrac{1}{1+e^{\boldsymbol{w}^{\mathrm{T}}\boldsymbol{x}+b}}.$
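These two expressions simply invert the log-odds relation; a tiny check of the round trip:

import numpy as np

z = 0.8                            # log odds w^T x + b for some sample
p1 = np.exp(z) / (1 + np.exp(z))   # p(y=1 | x) from the formula above
print(np.log(p1 / (1 - p1)))       # recovers z = 0.8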
Gradient descent
Writing $\boldsymbol{\beta} = (\boldsymbol{w}; b)$ and $\hat{\boldsymbol{x}} = (\boldsymbol{x}; 1)$, maximizing the log-likelihood is equivalent to minimizing
$\ell(\boldsymbol{\beta}) = \sum_{i=1}^{m} \left( -y_i \boldsymbol{\beta}^{\mathrm{T}} \hat{\boldsymbol{x}}_i + \ln\bigl(1 + e^{\boldsymbol{\beta}^{\mathrm{T}} \hat{\boldsymbol{x}}_i}\bigr) \right),$
a convex function of $\boldsymbol{\beta}$. Its first- and second-order derivatives with respect to $\boldsymbol{\beta}$ are, respectively,
$\dfrac{\partial \ell}{\partial \boldsymbol{\beta}} = -\sum_{i=1}^{m} \hat{\boldsymbol{x}}_i \bigl(y_i - p_1(\hat{\boldsymbol{x}}_i; \boldsymbol{\beta})\bigr), \qquad \dfrac{\partial^2 \ell}{\partial \boldsymbol{\beta}\, \partial \boldsymbol{\beta}^{\mathrm{T}}} = \sum_{i=1}^{m} \hat{\boldsymbol{x}}_i \hat{\boldsymbol{x}}_i^{\mathrm{T}}\, p_1(\hat{\boldsymbol{x}}_i; \boldsymbol{\beta}) \bigl(1 - p_1(\hat{\boldsymbol{x}}_i; \boldsymbol{\beta})\bigr),$
where $p_1(\hat{\boldsymbol{x}}; \boldsymbol{\beta}) = p(y=1 \mid \hat{\boldsymbol{x}}; \boldsymbol{\beta})$. Gradient descent repeatedly steps against the first derivative, $\boldsymbol{\beta} \leftarrow \boldsymbol{\beta} - \eta\, \partial \ell / \partial \boldsymbol{\beta}$, with learning rate $\eta$.
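The analytic first derivative can be verified against finite differences of ℓ(β); this sketch assumes the same β = (w; b) parameterization with x̂ = (x; 1), and all variable names are illustrative:

import numpy as np

rng = np.random.default_rng(0)
X_hat = np.hstack([rng.normal(size=(8, 2)), np.ones((8, 1))])  # x_hat = (x; 1)
y = rng.integers(0, 2, size=8).astype(float)
beta = rng.normal(size=3)

def loss(b):
    # l(beta) = sum_i (-y_i * b^T x_i + ln(1 + exp(b^T x_i)))
    z = X_hat @ b
    return np.sum(-y * z + np.log1p(np.exp(z)))

p1 = 1 / (1 + np.exp(-(X_hat @ beta)))
analytic = -X_hat.T @ (y - p1)          # first derivative from the formula above

eps = 1e-6
numeric = np.array([(loss(beta + eps * e) - loss(beta - eps * e)) / (2 * eps)
                    for e in np.eye(3)])
print(np.allclose(analytic, numeric, atol=1e-4))  # True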
Code implementation
import numpy as np
from sklearn.preprocessing import OneHotEncoder


class Logistic:
    def __init__(self, max_iter=5000, lr=0.01):
        self.lr = lr                    # learning rate (step size)
        self.max_iter = max_iter        # number of gradient-descent iterations
        self.onehot = OneHotEncoder()

    def fit(self, train_x, train_y):
        self.train_x = np.asarray(train_x, dtype=float)
        m, n = self.train_x.shape       # m samples, n one-hot features
        self.W = np.ones((n, 1))        # initial weights
        y = np.asarray(train_y).reshape(m, 1)
        self.train_y = (y + 1) / 2      # map labels {-1, 1} -> {0, 1} for the logistic loss
        self.W = self.gradient_descent()  # optimal weights
        return self

    # One-hot encode the raw training features
    def fit_transform(self, train_list, train_y=None):
        x = np.array(train_list, dtype=object).T   # one row per sample
        train_x = self.onehot.fit_transform(x)
        return train_x, np.array(train_y)

    def transform(self, test_x):
        return self.onehot.transform(np.array(test_x, dtype=object))

    # sigmoid function
    def sigmoid(self, X):
        return 1 / (1 + np.exp(-X))

    # Gradient descent: returns the optimal weights W
    def gradient_descent(self):
        X, Y = self.train_x, self.train_y
        for _ in range(self.max_iter):
            grad = X.T @ (self.sigmoid(X @ self.W) - Y)  # gradient of the negative log-likelihood
            self.W = self.W - self.lr * grad
        return self.W

    def predict(self, test_x):
        prob = self.sigmoid(np.asarray(test_x, dtype=float) @ self.W)  # p(y=1 | x)
        return [1 if p > 0.5 else -1 for p in prob.ravel()]


if __name__ == '__main__':
    train_list = [[1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3],
                  ['S', 'M', 'M', 'S', 'S', 'S', 'M', 'M', 'L', 'L', 'L', 'M', 'M', 'L', 'L', 'L']]
    train_y = [-1, -1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    test_data = [[1, 'S'], [2, 'S'], [3, 'S']]
    logic = Logistic()
    train_x, train_y = logic.fit_transform(train_list, train_y)
    logic.fit(train_x.toarray(), train_y)
    test_x = logic.transform(test_data).toarray()
    print('Test data: {}, predicted classes: {}'.format(test_data, logic.predict(test_x)))
Summary
Logistic regression requires no prior assumption about the data distribution; it produces an approximate probability estimate for each class (the probability values are also useful in downstream applications); and existing numerical optimization algorithms (such as Newton's method) can be applied directly to find the optimum, making it fast and efficient.
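As a concrete instance of the Newton's method mentioned above, each step uses both derivatives derived earlier; a minimal sketch on illustrative 1-D toy data (non-separable, so the optimum is finite):

import numpy as np

X_hat = np.array([[0.5, 1.0], [1.0, 1.0], [1.5, 1.0], [2.0, 1.0],
                  [2.5, 1.0], [3.0, 1.0], [3.5, 1.0], [4.0, 1.0]])  # x_hat = (x; 1)
y = np.array([0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0])

beta = np.zeros(2)
for _ in range(10):
    p1 = 1 / (1 + np.exp(-(X_hat @ beta)))
    grad = -X_hat.T @ (y - p1)                            # first derivative
    hess = X_hat.T @ (X_hat * (p1 * (1 - p1))[:, None])   # second derivative
    beta -= np.linalg.solve(hess, grad)                   # Newton update
print(beta)  # converges in a handful of iterations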