通过逻辑斯蒂回归实现手写数字1-9的分类
目的:理解softmax和sigmoid的差别,
加深对逻辑回归和softmax回归的理解
知识点:one-hot编码,softmax回归
数据的预处理+训练集的划分
import numpy as np
import pandas as pd
# Load the digits dataset: 42000 rows, column 0 is the label,
# columns 1..784 are the flattened 28x28 pixel values.
data = pd.read_csv("./digits.csv")
# Preview the first five rows.
data[:5]
label | pixel0 | pixel1 | pixel2 | pixel3 | pixel4 | pixel5 | pixel6 | pixel7 | pixel8 | ... | pixel774 | pixel775 | pixel776 | pixel777 | pixel778 | pixel779 | pixel780 | pixel781 | pixel782 | pixel783 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 785 columns
# Separate pixel features (every column after the label) from the labels.
X = data.values[:, 1:]
y = data['label'].values
display(X.shape, y.shape)
(42000, 784) (42000,)
第一列是label,其余列是图片像素矩阵的平铺向量,共784个像素值
看一下长什么样子
import matplotlib.pylab as plt
# Render the first sample's 784 pixel values as a 28x28 grayscale image.
plt.figure(figsize=(3, 3))
first_pixels = data.iloc[0][1:].values
display(first_pixels.shape)
plt.imshow(first_pixels.reshape(28, 28), cmap='gray')
(784,)
(图:第一个样本的 28×28 灰度图像,保存为 output_6_2.png)
784个特征太多了,而且大多数都是0的稀疏矩阵,计算没什么意义
这里利用PCA对特征进行降维,减少特征的维度
from sklearn.decomposition import PCA
# Keep just enough principal components to explain 95% of the variance;
# the projected data still represents the original samples.
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X)
X_pca.shape
(42000, 154)
切分训练集
from sklearn.model_selection import train_test_split
# Hold out 10% of the samples for evaluation.
# Fixed random_state makes the split (and the reported scores below)
# reproducible; the original omitted it, so every run differed.
X_train, X_test, y_train, y_test = train_test_split(
    X_pca, y, test_size=0.1, random_state=0)
display(X_train.shape, X_test.shape, y_train.shape)
(37800, 154)
(4200, 154)
(37800,)
数据归一化
from sklearn.preprocessing import StandardScaler
np.set_printoptions(suppress=True)
# BUG FIX: the original called scale() on the train and test sets
# independently, so each set was standardized with its own mean/std.
# Fit the scaler on the training data only and apply the *same*
# transform to the test data, as the model will see at inference time.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_test.shape
(4200, 154)
多分类建模
代码:
class My_SoftmaxRegression:
    """Softmax (multinomial logistic) regression trained by batch gradient descent.

    Parameters
    ----------
    penalty : None, 'l1' or 'l2'
        Regularization type (default: no regularization).
    Lambda : float
        Regularization strength (default 1).
    a : float
        Learning rate (default 0.001).
    epochs : int
        Maximum number of gradient steps (default 10001).
    """

    def __init__(self, penalty=None, Lambda=1, a=0.001, epochs=10001):
        self.W = None          # weight matrix, shape (n_classes, n_features + 1)
        self.classes_ = None   # sorted class labels observed during fit
        self.penalty = penalty
        self.Lambda = Lambda
        self.a = a
        self.epochs = epochs

    @staticmethod
    def softmax(z):
        """Column-wise softmax over the class axis (axis 0).

        BUG FIX: subtract the per-column max before exponentiating; the
        original `exp(x)/sum(exp(x))` overflows for large logits.
        """
        z = z - np.max(z, axis=0)
        e = np.exp(z)
        return e / np.sum(e, axis=0)

    def loss(self, x, y):
        """Mean cross-entropy (base 2) of one-hot targets y (classes x samples)."""
        m = x.shape[0]
        p = self.softmax(self.W * x.T)
        # Clip so log2 never sees an exact zero probability.
        p = np.clip(p, 1e-12, 1.0)
        return (-1 / m) * np.multiply(y, np.log2(p)).sum()

    def predict(self, X):
        """Return predicted class labels, shape (n_samples, 1)."""
        # Prepend the bias column of ones, matching fit().
        X = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1)
        p = self.softmax(self.W * X.T)
        # Highest-probability class index per column (= per sample).
        index = np.asarray(p.argmax(axis=0)).ravel()
        # BUG FIX: the original returned the raw argmax index, which is only
        # correct when the labels happen to be 0..C-1; map the index back to
        # the actual labels recorded during fit.
        labels = self.classes_[index] if self.classes_ is not None else index
        return np.array(labels).reshape(-1, 1)

    def fit(self, x, y):
        """Train the model; returns (W, lossList, last_iteration_index)."""
        lossList = []
        m = x.shape[0]                  # number of samples
        self.classes_ = np.unique(y)    # distinct class labels
        c_num = len(self.classes_)
        # Add the bias column of ones.
        X = np.concatenate((np.ones((m, 1)), x), axis=1)
        n = X.shape[1]                  # feature count including bias
        self.W = np.mat(np.ones((c_num, n)))
        xMat = np.mat(X)
        # One-hot encode with plain numpy. (The original used sklearn's
        # LabelBinarizer, which collapses to a single column for binary
        # problems and would break the gradient shape.)
        one_hot = (np.asarray(y).reshape(-1, 1) == self.classes_).astype(float)
        yMat = np.mat(one_hot.T)        # shape (n_classes, n_samples)
        loss = 0
        pre_loss = loss + 1
        for i in range(self.epochs):
            # Predicted class probabilities for every sample.
            p = self.softmax(self.W * xMat.T)
            gradient = (p - yMat) / m * xMat
            # BUG FIX: the original added the scalar *norm* of W to the
            # gradient. The correct penalty gradients are Lambda*W (l2)
            # and Lambda*sign(W) (l1), averaged over the samples.
            if self.penalty == 'l2':
                gradient = gradient + (self.Lambda / m) * self.W
            elif self.penalty == 'l1':
                gradient = gradient + (self.Lambda / m) * np.sign(self.W)
            self.W = self.W - self.a * gradient
            pre_loss = loss
            loss = self.loss(xMat, yMat)
            if i % 50 == 0:
                lossList.append(loss)
            # Stop once the loss has effectively converged.
            if np.abs(pre_loss - loss) < 0.002:
                break
        return self.W, lossList, i
训练预测评估
%%time
import warnings
warnings.filterwarnings('ignore')
# Train the hand-rolled softmax model with L1 regularization
# on the PCA-reduced, standardized training set.
My_Softmax = My_SoftmaxRegression(penalty='l1')
W,lossList,times = My_Softmax.fit(X_train,y_train)
(10, 37800)
CPU times: total: 328 ms
Wall time: 142 ms
from sklearn.metrics import accuracy_score
# Evaluate the custom model on the held-out test set.
y_pre = My_Softmax.predict(X_test)
display(y_pre.shape,y_test.shape)
my_score = accuracy_score(y_test,y_pre)
print("L1的softmax回归正确率:",my_score)
(4200, 1)
(4200,)
L1的softmax回归正确率: 0.8495238095238096
对比:使用 sklearn 的 LogisticRegression 作为基线
from sklearn.linear_model import LogisticRegression
# Baseline: sklearn's logistic regression on the same split.
model = LogisticRegression()
model.fit(X_train, y_train)
# The original initialized `score = 0` and accumulated a single value
# with `+=`; a direct assignment is the clear form.
score = model.score(X_test, y_test)
print('逻辑回归得分:', score)
逻辑回归得分: 0.9183333333333333