1. 基于逻辑回归模型的多分类
"""
基于逻辑回归模型的多分类
案例: 手写数字识别
"""
import numpy as np
import scipy.io as sio
import scipy.optimize as opt
def load_data(path, transpose=True):
data = sio.loadmat(path)
y = data.get('y') # (5000,1)
y = y.reshape(y.shape[0]) # make it back to column vector
X = data.get('X') # (5000,400) 每一行是一个数字图(20×20)
if transpose:
# for this dataset, you need a transpose to get the orientation right
X = np.array([im.reshape((20, 20)).T for im in X]) # 把每一行还原为20×20的(正常图片显示)二维数组形式,共5000行,每一行一个二维数组
# and I flat the image again to preserve the vector presentation
X = np.array([im.reshape(400) for im in X])
return X, y
def sigmoid(z):
return 1 / (1 + np.exp(-z))
def cost_function(theta, X, y, λ): # 正则化的代价函数
first = y.T @ np.log(sigmoid(X @ theta))
second = (1 - y.T) @ np.log(1 - sigmoid(X @ theta))
reg = (λ / (2 * len(X))) * np.sum(np.power(theta[1:], 2))
# print(first.shape,second.shape,reg)
return -np.sum(first + second) / len(X) + reg
def gradient(theta, X, y, λ):
theta = np.mat(theta) # theta变为1×401,矩阵化之前为(401,),如果是(401,1)就不会发生改变
X = np.mat(X) # 如果是矩阵化之前是二维数组形式,格式化后维数不变
y = np.mat(y)
error = sigmoid(X * theta.T) - y
grad = ((X.T * error) / len(X)).T + ((λ / len(X)) * theta)
# intercept gradient is not regularized
grad[0, 0] = np.sum(np.multiply(error, X[:, 0])) / len(X)
return np.array(grad).ravel()
def one_vs_all(X, y, num_labels, λ):
rows = X.shape[0]
params = X.shape[1]
all_theta = np.zeros((num_labels, params))
# 在X第一列插入1,计算截距(常数项)θ0
for i in range(1, num_labels + 1):
theta = np.zeros(params)
y_i = np.array([1 if label == i else 0 for label in y])
y_i = np.reshape(y_i, (rows, 1))
fmin = opt.minimize(fun=cost_function, x0=theta, args=(X, y_i, λ), method='TNC', jac=gradient)
all_theta[i - 1, :] = fmin.x # 存入一行(按列方向第0行,1行...排列)
return all_theta
def predict(X, theta_final):
# compute the class probability for each class on each training instance
h = sigmoid(X @ theta_final.T) # (5000,401) (10,401) =>(5000,10)
# create array of the index with the maximum probability
h_argmax = np.argmax(h, axis=1) # axis=1 按行(每一行最大值的索引)返回最大值索引
# 返回默认最大索引值是0-9,为了与实际label数据(1-10)对应,对其加1处理
return h_argmax + 1
if __name__ == "__main__":
raw_X, raw_y = load_data('ex3data1.mat') # raw_X--(5000,400),raw_y--(5000,)
rows = raw_X.shape[0]
params = raw_X.shape[1]
# 在X第一列插入1,计算截距(常数项)θ0
X = np.insert(raw_X, 0, values=np.ones(rows), axis=1) # axis=1沿行方向添加value=1的列 --插入了第一列(全部为1),X变为(5000,401)
theta = np.zeros(params + 1)
y_0 = np.reshape(raw_y, (rows, 1)) # (5000,1)
print(y_0.shape)
print(np.unique(y_0))
all_theta = one_vs_all(X, y_0, 10, 1)
print(all_theta.shape)
# 精度验证
y_pred = predict(X, all_theta)
y_pred = y_pred.reshape((rows, 1)) # 注:两个数组(预测的label与真实label)维数一定保持完全相同
print(y_pred.shape)
print('Accuracy={}'.format(np.mean(y_0 == y_pred)))
2. 数据加载显示
"""
案例: 手写数字识别
"""
import matplotlib.pyplot as plt
import numpy as np
import scipy.io as sio
import matplotlib
def load_data(path, transpose=True):
data = sio.loadmat(path)
y = data.get('y') # (5000,1)
y = y.reshape(y.shape[0]) # make it back to column vector
X = data.get('X') # (5000,400) 每一行是一个数字图(20×20)
print(X.shape, y.shape)
if transpose:
# for this dataset, you need a transpose to get the orientation right
X = np.array([im.reshape((20, 20)).T for im in X]) # 把每一行还原为20×20的(正常图片显示)二维数组形式,共5000行,每一行一个二维数组
# and I flat the image again to preserve the vector presentation
X = np.array([im.reshape(400) for im in X])
print(len([1 if label == 0 else 0 for label in y]))
return X, y
def plot_an_image(image): # 绘图函数
# """
# image : (400,)
# """
fig, ax = plt.subplots(figsize=(1, 1))
ax.matshow(image.reshape((20, 20)),
cmap=matplotlib.cm.binary) # matshow()函数绘制矩阵,cmap意思是color map,颜色方案,binary代表是白底黑字
plt.xticks(np.array([])) # just get rid of ticks
plt.yticks(np.array([])) # 一个是刻标(locs),一个是刻度标签(tick labels),不显示tick可以传入空的参数(不显示刻度)
def plot_100_image(X): # 绘图函数,画100张图片
""" sample 100 image and show them
assume the image is square
X : (5000, 400)
"""
size = int(np.sqrt(X.shape[1]))
# sample 100 image, reshape, reorg it
sample_idx = np.random.choice(np.arange(X.shape[0]), 100) # 100*400 5000张图片中取出100张(5000行里随机取100行)
sample_images = X[sample_idx, :]
fig, ax_array = plt.subplots(nrows=10, ncols=10, sharey=True, sharex=True, figsize=(8, 8)) # sharey=True共享x,y轴
for r in range(10):
for c in range(10):
ax_array[r, c].matshow(sample_images[10 * r + c].reshape((size, size)),
cmap=matplotlib.cm.binary)
plt.xticks(np.array([]))
plt.yticks(np.array([]))
if __name__ == '__main__':
X, y = load_data('ex3data1.mat')
pick_one = np.random.randint(0, 5000)
plot_100_image(X)
plot_an_image(X[pick_one, :])
plt.show()
print('this should be {}'.format(y[pick_one]))
3. 神经网络模型-分类
"""
基于神经网络的多分类问题
案例: 手写数字识别
"""
import numpy as np
import scipy.io as sio
from sklearn.metrics import classification_report # 这个包是评价报告
def load_data(path, transpose=True):
data = sio.loadmat(path)
y = data.get('y') # (5000,1)
y = y.reshape(y.shape[0]) # make it back to column vector
X = data.get('X') # (5000,400) 每一行是一个数字图(20×20)
if transpose:
# for this dataset, you need a transpose to get the orientation right
# X = np.array([im.reshape((20, 20)).T for im in X]) # 把每一行还原为20×20的(正常图片显示)二维数组形式,共5000行,每一行一个二维数组
# and I flat the image again to preserve the vector presentation
X = np.array([im.reshape(400) for im in X])
return X, y
def load_weight(path):
data = sio.loadmat(path)
return data["Theta1"], data["Theta2"]
def sigmoid(z):
return 1 / (1 + np.exp(-z))
if __name__ == "__main__":
raw_X, raw_y = load_data('ex3data1.mat') # raw_X--(5000,400),raw_y--(5000,)
X = np.insert(raw_X, 0, values=np.ones(raw_X.shape[0]), axis=1) # 插入了第一列(全部为1),X变为(5000,401)
theta1, theta2 = load_weight("ex3weights.mat") # theta1为(25,401),theta2为(10,26)
a1 = X
z2 = a1 @ theta1.T # (5000, 401) @ (25,401).T = (5000, 25)
a2 = sigmoid(z2)
a2 = np.insert(a2, 0, values=1, axis=1) # (5000,26)
print(a2.shape)
z3 = a2 @ theta2.T # (5000,26) @ (26,10) = (5000,10)
a3 = sigmoid(z3)
y_pred = np.argmax(a3, axis=1)
y_pred = y_pred + 1
print(y_pred)
print('Accuracy={}'.format(np.mean(raw_y == y_pred)))
print(classification_report(raw_y, y_pred))
4. 训练一维和多维模型-分类
"""
一维和多维模型分类
案例: 手写数字识别
"""
import numpy as np
import scipy.io as sio
import scipy.optimize as opt
from sklearn.metrics import classification_report # 这个包是评价报告
def load_data(path, transpose=True):
data = sio.loadmat(path)
y = data.get('y') # (5000,1)
y = y.reshape(y.shape[0]) # make it back to column vector
X = data.get('X') # (5000,400) 每一行是一个数字图(20×20)
if transpose:
# for this dataset, you need a transpose to get the orientation right
X = np.array([im.reshape((20, 20)).T for im in X]) # 把每一行还原为20×20的(正常图片显示)二维数组形式,共5000行,每一行一个二维数组
# and I flat the image again to preserve the vector presentation
X = np.array([im.reshape(400) for im in X])
return X, y
def load_weight(path):
data = sio.loadmat(path)
return data["Theta1"], data["Theta2"]
def sigmoid(z):
return 1 / (1 + np.exp(-z))
def cost_function(theta, X, y, λ): # 正则化的代价函数
first = y.T @ np.log(sigmoid(X @ theta))
second = (1 - y.T) @ np.log(1 - sigmoid(X @ theta))
reg = (λ / (2 * len(X))) * np.sum(np.power(theta[1:], 2))
# print(first.shape,second.shape,reg)
return -np.sum(first + second) / len(X) + reg
def gradient_reg(theta, X, y, λ):
reg = theta[1:] * (λ / len(X))
reg = np.insert(reg, 0, values=0, axis=0)
first = (X.T @ (sigmoid(X @ theta) - y)) / len(X)
return first + reg
def logistic_regression(X, y, λ=1):
"""generalized logistic regression
args:
X: feature matrix, (m, n+1) # with incercept x0=1
y: target vector, (m, )
l: lambda constant for regularization
return: trained parameters
"""
# init theta
theta = np.zeros(X.shape[1])
# train it
res = opt.minimize(fun=cost_function,
x0=theta,
args=(X, y, λ),
method='TNC',
jac=gradient_reg,
options={'disp': True})
# get trained parameters
final_theta = res.x
return final_theta
def predict(X, theta):
prob = sigmoid(X @ theta)
return (prob >= 0.5).astype(int)
if __name__ == "__main__":
raw_X, raw_y = load_data('ex3data1.mat') # raw_X--(5000,400),raw_y--(5000,)
from collections import Counter
print(Counter(raw_y==10))
X = np.insert(raw_X, 0, values=np.ones(raw_X.shape[0]), axis=1) # 插入了第一列(全部为1),X变为(5000,401)
# y have 10 categories here. 1..10, they represent digit 0 as category 10 because matlab index start at 1
# I'll ditit 0, index 0 again
y_matrix = []
for k in range(1, 11):
y_matrix.append((raw_y == k).astype(int)) # 见配图 "向量化标签.png"
# last one is k==10, it's digit 0, bring it to the first position,最后一列k=10,都是0,把最后一列放到第一列
print(raw_y,Counter(y_matrix[0]))
y_matrix = [y_matrix[-1]] + y_matrix[:-1] # 把最后k=10的数组,移到开头第一组的位置
y = np.array(y_matrix) # y为(10,5000)
# train k model(训练一维模型)
theta0_final = logistic_regression(X, y[0])
print(theta0_final.shape)
y_pred = predict(X, theta0_final)
print('Accuracy={}'.format(np.mean(y[0] == y_pred)))
# train k model(训练k维模型)
k_theta = np.array([logistic_regression(X, y[k]) for k in range(10)]) # k_theta(10,401)
print(k_theta.shape)
y_preds = predict(X, k_theta.T).T # y_reds(10,5000)
print(y_preds.shape)
for i in range(10):
print('Accuracy{}={}'.format(i,np.mean(y[i] == y_preds[i])))
# 混淆矩阵精度验证
prob_matrix = sigmoid(X @ k_theta.T) # prob_matrix为(5000,10)的数组
np.set_printoptions(suppress=True) # 设置打印样式,使输入出结果更美观
print(prob_matrix.shape)
y_pre = np.argmax(prob_matrix, axis=1) # 返回沿轴axis最大值的索引,axis=1代表行--还原回与原始raw_y数据相同的维数,但类别标签不同,一个0-9,一个1-10
print(y_pre)
print(np.unique(y_pre))
# 数据处理,把raw_y数据的标签范围处理成0-9范围内
y_answer = raw_y.copy()
y_answer[y_answer == 10] = 0
print(classification_report(y_answer, y_pre)) # 输出评价报告(包括准确率,召回率,否f1得分等)