Binary classification is one of the most fundamental problems in machine learning. In this tutorial you will learn how to implement a linear binary classifier that predicts, from a person's profile, whether their annual income exceeds $50,000. We will tackle the task with two methods, logistic regression and a generative model, and you are encouraged to study and compare the design ideas behind the two.
The task: is a person's income above $50,000? Let's go straight to the code.
1. Load the data
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(0)
X_train_fpath = 'work/data/X_train'
Y_train_fpath = 'work/data/Y_train'
X_test_fpath = 'work/data/X_test'
output_fpath = 'work/output_{}.csv'
# Parse the csv files into numpy arrays (skip the header row; drop the id column)
with open(X_train_fpath) as f:
    next(f)
    X_train = np.array([line.strip('\n').split(',')[1:] for line in f], dtype=float)
with open(Y_train_fpath) as f:
    next(f)
    Y_train = np.array([line.strip('\n').split(',')[1] for line in f], dtype=float)
with open(X_test_fpath) as f:
    next(f)
    X_test = np.array([line.strip('\n').split(',')[1:] for line in f], dtype=float)
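A quick sanity check of the parsed arrays (the exact shapes depend on your copy of the data; the assertions only check internal consistency):

# The train and test feature matrices must have the same number of columns,
# and X_train must pair up row-for-row with Y_train.
print(X_train.shape, Y_train.shape, X_test.shape)
assert X_train.shape[1] == X_test.shape[1]
assert X_train.shape[0] == Y_train.shape[0]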
2. Normalization
def _normalize(X, train=True, specified_column=None, X_mean=None, X_std=None):
    # This function normalizes specific columns of X.
    # The mean and standard deviation of the training data are reused when
    # processing the test data.
    #
    # Arguments:
    #     X: the data to process
    #     train: 'True' when processing the training set, 'False' for the test set
    #     specified_column: indexes of the columns to normalize; if 'None',
    #         all columns are normalized
    #     X_mean: mean of the training data, used only when train = 'False'
    #     X_std: standard deviation of the training data, used only when train = 'False'
    # Outputs:
    #     X: the normalized data
    #     X_mean: mean of the training data
    #     X_std: standard deviation of the training data
    if specified_column is None:
        specified_column = np.arange(X.shape[1])
    if train:
        X_mean = np.mean(X[:, specified_column], 0).reshape(1, -1)
        X_std = np.std(X[:, specified_column], 0).reshape(1, -1)
    X[:, specified_column] = (X[:, specified_column] - X_mean) / (X_std + 1e-8)
    return X, X_mean, X_std
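A minimal usage sketch with made-up numbers, showing how the statistics returned by the training pass are fed back in for the test pass:

# Toy data, purely illustrative: standardize a 'training' array, then apply
# the same mean/std to a 'test' array.
toy_train = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
toy_test = np.array([[2.0, 25.0]])
toy_train, toy_mean, toy_std = _normalize(toy_train, train=True)
toy_test, _, _ = _normalize(toy_test, train=False, X_mean=toy_mean, X_std=toy_std)
print(toy_test)  # standardized with the training statistics, not its own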
3. Splitting into training and development sets
def _train_dev_split(X, Y, dev_ratio=0.25):
    # This function splits the data into a training set and a development set.
    train_size = int(len(X) * (1 - dev_ratio))
    return X[:train_size], Y[:train_size], X[train_size:], Y[train_size:]
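Note that the split is purely positional, so if the rows in the file happened to be ordered (for example by label), the development set would be biased. If that is a concern, one option is to permute the rows once before splitting; a minimal sketch:

# Optional: randomize row order before the positional split
# (np.random.seed(0) above keeps this reproducible).
perm = np.random.permutation(len(X_train))
X_train, Y_train = X_train[perm], Y_train[perm]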
# Normalize the training and test data first
X_train, X_mean, X_std = _normalize(X_train, train=True)
X_test, _, _ = _normalize(X_test, train=False, specified_column=None, X_mean=X_mean, X_std=X_std)
# Split the data into a training set and a development set (9:1)
dev_ratio = 0.1
X_train, Y_train, X_eval, Y_eval = _train_dev_split(X_train, Y_train, dev_ratio=dev_ratio)
train_size = X_train.shape[0]
eval_size = X_eval.shape[0]
data_dim = X_train.shape[1]
print('Size of training set: {}'.format(train_size))
print('Size of eval set: {}'.format(eval_size))
print('Size of testing set: {}'.format(X_test.shape[0]))
print('Dimension of data: {}'.format(data_dim))
4. Shuffling and other utilities
def _shuffle(X, Y):
    # This function shuffles two equal-length lists/arrays X and Y together.
    randomize = np.arange(len(X))
    np.random.shuffle(randomize)
    return (X[randomize], Y[randomize])

def _sigmoid(z):
    # The sigmoid function, used to compute probabilities.
    # The output is clipped to avoid overflow and log(0) later on.
    return np.clip(1 / (1.0 + np.exp(-z)), 1e-8, 1 - (1e-8))

def _f(X, w, b):
    # The logistic regression function, parameterized by w and b.
    #
    # Arguments:
    #     X: input data, shape = [batch_size, data_dimension]
    #     w: weight vector, shape = [data_dimension, ]
    #     b: bias, scalar
    # Output:
    #     predicted probability of the positive label for each row of X,
    #     shape = [batch_size, ]
    return _sigmoid(np.matmul(X, w) + b)

def _predict(X, w, b):
    # This function returns a 0/1 prediction for each row of X
    # by rounding the result of the logistic regression function.
    return np.round(_f(X, w, b)).astype(int)

def _accuracy(Y_pred, Y_label):
    # This function computes prediction accuracy.
    acc = 1 - np.mean(np.abs(Y_pred - Y_label))
    return acc
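A couple of one-line sanity checks on the helpers, with toy values chosen purely for illustration:

print(_sigmoid(np.array([0.0])))   # sigmoid at 0 is 0.5
print(_accuracy(np.array([1., 0., 1.]), np.array([1., 0., 1.])))  # perfect match -> 1.0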
5. Loss function and gradient
# Cross-entropy loss: with 0/1 labels, exactly one of the two terms is nonzero for each example
def _cross_entropy_loss(y_pred, Y_label):
    # This function computes the cross-entropy.
    #
    # Arguments:
    #     y_pred: predicted probabilities, float vector
    #     Y_label: ground-truth labels, boolean vector
    # Output:
    #     cross entropy, scalar
    cross_entropy = -np.dot(Y_label, np.log(y_pred)) - np.dot((1 - Y_label), np.log(1 - y_pred))
    return cross_entropy
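For reference, the quantity computed above is the (unaveraged) binary cross-entropy of the batch,

$$L(w, b) = -\sum_{n}\Big[\,y^{n}\ln f_{w,b}(x^{n}) + (1 - y^{n})\ln\big(1 - f_{w,b}(x^{n})\big)\Big],$$

where $f_{w,b}(x) = \sigma(w \cdot x + b)$ is the logistic regression function defined in section 4.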
def _gradient(X, Y_label, w, b):
    # This function computes the gradient of the cross-entropy loss
    # with respect to the weights w and the bias b.
    y_pred = _f(X, w, b)
    pred_error = Y_label - y_pred
    w_grad = -np.sum(pred_error * X.T, 1)
    b_grad = -np.sum(pred_error)
    return w_grad, b_grad
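Differentiating the loss above gives $\nabla_w L = -\sum_n (y^n - f_{w,b}(x^n))\,x^n$ and $\partial L/\partial b = -\sum_n (y^n - f_{w,b}(x^n))$, which is exactly what _gradient computes. A minimal finite-difference sanity check on toy data (all values below are made up for illustration):

# Compare the analytic gradient from _gradient against a central-difference
# estimate of the cross-entropy loss; the two columns should agree closely.
rng = np.random.default_rng(0)
X_toy = rng.normal(size=(5, 3))
Y_toy = rng.integers(0, 2, size=5).astype(float)
w_toy, b_toy, eps = rng.normal(size=3), 0.1, 1e-6

w_grad_toy, b_grad_toy = _gradient(X_toy, Y_toy, w_toy, b_toy)
for j in range(3):
    w_plus, w_minus = w_toy.copy(), w_toy.copy()
    w_plus[j] += eps
    w_minus[j] -= eps
    numeric = (_cross_entropy_loss(_f(X_toy, w_plus, b_toy), Y_toy)
               - _cross_entropy_loss(_f(X_toy, w_minus, b_toy), Y_toy)) / (2 * eps)
    print(w_grad_toy[j], numeric)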
6. Training and visualization
# Initialize w and b
w = np.zeros((data_dim,))
b = np.zeros((1,))
# Training hyperparameters
EPOCH = 10
batch_size = 128
learning_rate = 0.2
# Record the loss and accuracy at each logging step, for plotting later
train_loss = []
eval_loss = []
train_acc = []
eval_acc = []
# Count of parameter updates so far, used for the learning-rate decay
adagrad_step = 1
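The update implemented in the loop below is plain mini-batch gradient descent with a $1/\sqrt{t}$ learning-rate decay; despite the variable name adagrad_step, this is simpler than full Adagrad, which would divide by the accumulated squared gradients instead:

$$w \leftarrow w - \frac{\eta}{\sqrt{t}}\,\nabla_{w}L, \qquad b \leftarrow b - \frac{\eta}{\sqrt{t}}\,\frac{\partial L}{\partial b},$$

where $t$ is the update count (adagrad_step) and $\eta$ is learning_rate.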
# Iterative training
for epoch in range(EPOCH):
    # Randomly shuffle the data at the start of each epoch
    X_train, Y_train = _shuffle(X_train, Y_train)
    # Mini-batch training
    step = 0
    steps = int(np.floor(train_size / batch_size))
    for idx in range(steps):  # e.g. floor(48830 / 128) = 381 batches per epoch
        X = X_train[idx*batch_size:(idx+1)*batch_size]
        Y = Y_train[idx*batch_size:(idx+1)*batch_size]
        # Compute the gradient
        w_grad, b_grad = _gradient(X, Y, w, b)
        # Gradient descent update, with the learning rate decaying over time
        w = w - learning_rate / np.sqrt(adagrad_step) * w_grad
        b = b - learning_rate / np.sqrt(adagrad_step) * b_grad
        step += 1
        adagrad_step += 1
        # Periodically compute the loss and accuracy on the training and development sets
        if step % 50 == 0 or step == steps:
            y_train_pred = _f(X_train, w, b)
            Y_train_pred = np.round(y_train_pred)
            y_eval_pred = _f(X_eval, w, b)
            Y_eval_pred = np.round(y_eval_pred)
            acc_train = _accuracy(Y_train_pred, Y_train)
            loss_train = _cross_entropy_loss(y_train_pred, Y_train) / train_size
            acc_eval = _accuracy(Y_eval_pred, Y_eval)
            loss_eval = _cross_entropy_loss(y_eval_pred, Y_eval) / eval_size
            print(f'Epoch {epoch}/{EPOCH}, step {step}/{steps} : train_loss = {loss_train}, train_acc = {acc_train}, eval_loss = {loss_eval}, eval_acc = {acc_eval}')
            train_acc.append(acc_train)
            train_loss.append(loss_train)
            eval_acc.append(acc_eval)
            eval_loss.append(loss_eval)
print('Training loss: {}'.format(train_loss[-1]))
print('Eval loss: {}'.format(eval_loss[-1]))
print('Training accuracy: {}'.format(train_acc[-1]))
print('Eval accuracy: {}'.format(eval_acc[-1]))
# Loss curve
# plt.figure(figsize=(16, 8))
plt.plot(train_loss)
plt.plot(eval_loss)
plt.title('Loss')
plt.legend(['train', 'eval'])
plt.savefig('loss.png')
plt.show()
# Accuracy curve
# plt.figure(figsize=(16, 8))
plt.plot(train_acc)
plt.plot(eval_acc)
plt.title('Accuracy')
plt.legend(['train', 'eval'])
plt.savefig('acc.png')
plt.show()
7. Prediction
# Write the predictions to output_logistic.csv
# Predict testing labels
predictions = _predict(X_test, w, b)
with open(output_fpath.format('logistic'), 'w') as f:
    f.write('id,label\n')
    for i, label in enumerate(predictions):
        f.write('{},{}\n'.format(i, label))
# Print out the most significant weights
ind = np.argsort(np.abs(w))[::-1]
with open(X_test_fpath) as f:
    content = f.readline().strip('\n').split(',')
    features = np.array(content)
for i in ind[0:10]:
    print(features[i], w[i])