# Annual-income binary classification (logistic regression).
import numpy as np
import os
# Dataset file paths.
# Bug fix: the original used fullwidth quotes (“ ”), which are a SyntaxError
# in Python; raw strings keep the Windows backslashes literal and silence the
# invalid-escape-sequence deprecation warning.
prefix = r"D:\xcx\df_finance\lhy\hw2"  # NOTE(review): machine-specific path, currently unused below
train_x = r"hw2\data\X_train"
train_y = r"hw2\data\Y_train"
test_x = r"hw2\data\X_test"
os.path.abspath("./data/X_train")  # debug probe: result is discarded
# Data: normalization, train/validation split, shuffling.
# Load the three CSV files; the first line of each is a header row.
with open(train_x) as f:
    next(f)  # drop header
    X_train = np.array([row.strip("\n").split(",") for row in f], dtype='float')
with open(train_y) as f:
    next(f)  # drop header
    # Take column 1 as the label; column 0 is skipped (presumably a row id — confirm).
    Y_train = np.array([row.strip("\n").split(",")[1] for row in f], dtype='float')
with open(test_x) as f:
    next(f)  # drop header
    X_test = np.array([row.strip("\n").split(",") for row in f], dtype='float')
# Debug probes (expression statements, results discarded): verify that the
# per-column mean reshapes to a (1, n_features) row vector so it broadcasts
# over rows, and inspect the label vector's shape.
np.mean(X_train,axis=0).reshape(1,-1).shape # reshape to 2-D for broadcasting
Y_train.shape
def _normalize(X,specified_columns=None,train=True,X_mean=None,X_std=None):
if specified_columns==None:
specified_columns=np.arange(X.shape[1])
# specified_columns=np.arange(len(X))
if train:
X_mean=np.mean(X,axis=0).reshape(1,-1)
X_std=np.std(X,axis=0).reshape(1,-1)
X[:,specified_columns]=(X[:,specified_columns]-X_mean)/(X_std+1e-8)
return X,X_mean,X_std
def train_dev_split(X, Y, dev_ratio=0.2):
    """Split X/Y into training and development sets (no shuffling).

    Bug fix: the original assigned X[train_len:] to BOTH the training and the
    dev set, so the "training" set was actually the last dev_ratio fraction
    and overlapped the dev set. The training set is the first
    (1 - dev_ratio) fraction of the rows.

    Args:
        X, Y: arrays indexed along their first axis.
        dev_ratio: fraction of samples reserved for the dev set.

    Returns:
        (train_x, train_y, dev_x, dev_y)
    """
    train_len = int(len(X) * (1 - dev_ratio))
    # NumPy arrays slice directly along the first axis.
    return X[:train_len], Y[:train_len], X[train_len:], Y[train_len:]
# Normalize the training data with its own statistics, then apply those SAME
# mean/std to the test set so both share one scale.
X_train,X_mean,X_std = _normalize(X_train)
X_test,X_mean,X_std=_normalize(X_test,train=False,X_mean=X_mean,X_std=X_std)
trainx,trainy,devx,devy=train_dev_split(X_train,Y_train,0.2)
X_train.shape
# Build the model: define the forward function — sigmoid (logistic regression).
# Numerical tricks and accuracy helpers.
def _sigmod(z):
p = np.clip(1/(1+np.exp(-z)),1e-8,1-1e-8)# 认为sigmod它不会达到01 因此用夹到可以识别的小数区间
return p
def _f(X, w, b):
    """Logistic-regression forward pass: sigmoid(X @ w + b).

    Does not update w or b — it only evaluates the model.
    """
    logits = np.matmul(X, w) + b
    return _sigmod(logits)
def _predit(y_pred):
return np.round(Y_pred)
def _acc(y_pred,Y_label):# 计算准确率
return 1-np.mean(np.abs(Y_label-y_pred))
# Smoke test: one forward pass with zero-initialized parameters
# (every prediction should be sigmoid(0) = 0.5).
w = np.zeros(trainx.shape[1],)
b = np.zeros((1,))
_f(trainx,w,b)
# Loss function: cross-entropy (with numerical safeguards from the clipped sigmoid).
def _loss(y_pred,Y_label):
# 两个向量相乘 内积
loss=-np.dot(Y_label,np.log(y_pred))-np.dot((1-Y_label),np.log(1-y_pred))
# print(loss.shape)
return loss
# Smoke test: cross-entropy of the zero-parameter model on the training set.
_loss(_f(trainx,w,b),trainy)
# Gradient of the cross-entropy loss (see derivation in the course notes).
def _gradient(X, Y_label, w, b):
    """Gradient of the summed cross-entropy loss w.r.t. weight w and bias b."""
    pred_error = Y_label - _f(X, w, b)
    # X.T @ error sums error-weighted features over samples,
    # equivalent to np.sum(pred_error * X.T, axis=1).
    w_grad = -np.matmul(X.T, pred_error)
    b_grad = -np.sum(pred_error)
    return w_grad, b_grad
# Smoke test: gradient at the zero-initialized parameters.
_gradient(trainx,trainy,w,b)
# Model training.
# Per-epoch metric histories; _train appends to these module-level lists.
dev_acc=[]
dev_loss=[]
train_acc=[]
train_loss=[]
def _train(X, Y, X_dev, Y_dev, batch_size, epoches, lr):
    """Train logistic regression with mini-batch gradient descent and a
    1/sqrt(step) learning-rate decay.

    Appends one entry per epoch to the module-level lists train_acc,
    train_loss, dev_acc, dev_loss.

    Bug fixes vs. the original:
      * the gradient is computed on the current mini-batch (x, y) — the
        original sliced the batch but then called _gradient(X, Y, ...) on
        the full training set, making the batching dead code;
      * np.int (removed in NumPy 1.24) replaced with the builtin int;
      * the learned parameters are returned instead of being discarded.

    Args:
        X, Y: training features and 0/1 labels.
        X_dev, Y_dev: development-set features and labels.
        batch_size: samples per gradient step (a trailing partial batch is dropped).
        epoches: number of passes over the training data.
        lr: initial learning rate.

    Returns:
        (w, b): trained weight vector and bias.
    """
    w = np.zeros((X.shape[1],))
    b = np.zeros((1,))
    n_batches = int(np.floor(len(X) / batch_size))
    step = 1
    for epoch in range(epoches):
        # NOTE(review): per-epoch shuffling is still disabled, as in the original.
        for idx in range(n_batches):
            # Slice the mini-batch directly along the first axis.
            x = X[idx * batch_size:(idx + 1) * batch_size]
            y = Y[idx * batch_size:(idx + 1) * batch_size]
            # Bug fix: gradient of the MINI-BATCH, not the whole dataset.
            w_grad, b_grad = _gradient(x, y, w, b)
            # Learning rate decays as lr / sqrt(step).
            w = w - lr / np.sqrt(step) * w_grad
            b = b - lr / np.sqrt(step) * b_grad
            step += 1
        # Record metrics once per epoch.
        y_pred = _f(X, w, b)
        train_acc.append(_acc(np.round(y_pred), Y))
        train_loss.append(_loss(y_pred, Y))
        y_pred = _f(X_dev, w, b)
        dev_acc.append(_acc(np.round(y_pred), Y_dev))
        dev_loss.append(_loss(y_pred, Y_dev))
    return w, b
# Train: batch size 10, 10 epochs, initial learning rate 0.2.
_train(trainx,trainy,devx,devy,10,10,0.2)
# Loss curves.
# Plot the per-epoch train/dev loss histories and save them to loss.png.
import matplotlib.pyplot as plt
plt.plot(train_loss)
plt.plot(dev_loss)
plt.title('Loss')
plt.legend(['train', 'dev'])
plt.savefig('loss.png')
plt.show()
# Accuracy曲线
plt.plot(train_acc)
plt.plot(dev_acc)
plt.title('Accuracy')
plt.legend(['train', 'dev'])
plt.savefig('acc.png')
plt.show()
# Always double-check array shapes; debug with small parameter values first.