这算是一个很简单的kaggle比赛了,也是非常适合入门的。
直入主题
数据预处理
读入数据后,可以发现有些数据是连续的,有些数据是离散的,有些数据是文本的,有些数据是缺失的。那么很明确,我们需要做的两个工作是,将文本转换成数值数据,这里可以采用one-hot形式,然后再就是将缺失的数据补全。
%matplotlib inline
import numpy as np
import pandas as pd
import torch
from torch import nn
import seaborn as sns
# Load the Kaggle "House Prices" data; train.csv contains the SalePrice
# target column, test.csv does not.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
# Heatmap of the boolean missing-value mask: one cell per entry,
# highlighted where the value is NaN.
sns.heatmap(train_data.isnull())
# Per-column count of missing values.
train_data.isnull().sum()
通过seaborn的heatmap可以看到缺失值的一个大致情况。
首先,为了减少数据之间的数值差距,将数值型数据标准化(减去均值再除以标准差)。然后再把缺失值直接设置成0。这是因为数据标准化之后,每列的均值变为0,所以把缺失值设置成0其实就相当于设置成了均值。然后就是使用pandas的get_dummies,将文本数据转换成one-hot。
# Concatenate train and test features (dropping the Id column and, for the
# training frame, the trailing SalePrice target) so standardization and
# one-hot encoding are applied consistently to both splits.
all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))
# Columns whose dtype is not 'object' are treated as numeric.
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
# Standardize numeric columns to zero mean / unit variance.
all_features[numeric_features] = all_features[numeric_features].apply(
    lambda x: (x - x.mean()) / (x.std())
)
# After standardization each column's mean is 0, so filling NaN with 0 is
# equivalent to filling with the column mean.
all_features = all_features.fillna(0)
# One-hot encode categorical columns; dummy_na=True adds an indicator
# column for missing values. dtype=float keeps the frame numeric — recent
# pandas emits bool dummy columns by default, which would make `.values`
# an object array and torch.tensor(...) below raise.
all_features = pd.get_dummies(all_features, dummy_na=True, dtype=float)
n_train = train_data.shape[0]
# Split back into train/test tensors; labels are a (n_train, 1) column.
train_features = torch.tensor(all_features[:n_train].values, dtype=torch.float32)
test_features = torch.tensor(all_features[n_train:].values, dtype=torch.float32)
train_labels = torch.tensor(train_data.SalePrice.values.reshape(-1, 1), dtype=torch.float32)
模型训练
在处理好了数据之后就可以开始构建模型,训练模型了。
# The input width must be known before the Linear layer is built; in the
# original order it was only assigned further below, which raises a
# NameError when the cells run top-to-bottom.
in_features = train_features.shape[1]

# Plain linear regression: one fully-connected layer mapping all
# preprocessed feature columns to a single price prediction.
net = nn.Sequential(
    nn.Flatten(),
    nn.Linear(in_features, 1),
)

def init_weights(m):
    """Initialize Linear layers in-place with small Gaussian weights (std=0.01)."""
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, std=0.01)

net.apply(init_weights)
这里直接使用pytorch构建了线性模型。(这里试过其他的复杂模型,好像效果并没有更好)
# Training loss: mean-squared error on raw prices. Evaluation below uses
# log-RMSE (the competition metric) instead.
loss = nn.MSELoss()
# NOTE(review): `net` above already references in_features, so this
# assignment comes too late if cells execute top-to-bottom — it should be
# computed before the Sequential is built.
in_features = train_features.shape[1]
这里损失函数采用均方误差,但是在最终的test数据集计算误差时,采用rmse
def log_rmse(net, features, labels):
    """Return the RMSE between log-predictions and log-labels as a float.

    Predictions are clamped to [1, inf) so the logarithm stays finite.
    Relies on the module-level MSE ``loss``.
    """
    preds = torch.clamp(net(features), min=1.0)
    err = torch.sqrt(loss(torch.log(preds), torch.log(labels)))
    return err.item()
然后就是使用k折交叉验证进行训练
from torch.utils import data
from d2l import torch as d2l
def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, lr, weight_decay, batch_size):
    """Train `net` with Adam and return per-epoch log-RMSE histories.

    Args:
        net: model to train (updated in place).
        train_features/train_labels: training tensors.
        test_features/test_labels: optional held-out tensors; pass None
            for test_labels to skip validation tracking.
        num_epochs, lr, weight_decay, batch_size: optimization settings.

    Returns:
        (train_ls, test_ls): lists of per-epoch log-RMSE values; test_ls
        is empty when test_labels is None.
    """
    train_ls, test_ls = [], []
    dataset = data.TensorDataset(train_features, train_labels)
    # shuffle=True: minibatch SGD expects a randomized batch order each
    # epoch; the original DataLoader iterated in a fixed order.
    train_iter = data.DataLoader(dataset, batch_size, shuffle=True)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr,
                                 weight_decay=weight_decay)
    for epoch in range(num_epochs):
        for X, y in train_iter:
            optimizer.zero_grad()
            l = loss(net(X), y)
            l.backward()
            optimizer.step()
        # Record the epoch-level metric on the full splits.
        train_ls.append(log_rmse(net, train_features, train_labels))
        if test_labels is not None:
            test_ls.append(log_rmse(net, test_features, test_labels))
    return train_ls, test_ls
def get_k_fold_data(k, i, X, y):
    """Split (X, y) into k equal folds; fold i becomes validation data.

    Returns (X_train, y_train, X_valid, y_valid), where the training
    tensors are the remaining k-1 folds concatenated in order. Any
    trailing rows beyond k * (n // k) are dropped.
    """
    assert k > 1
    fold_size = X.shape[0] // k
    train_X_parts, train_y_parts = [], []
    X_valid = y_valid = None
    for j in range(k):
        sel = slice(j * fold_size, (j + 1) * fold_size)
        if j == i:
            X_valid, y_valid = X[sel, :], y[sel]
        else:
            train_X_parts.append(X[sel, :])
            train_y_parts.append(y[sel])
    X_train = torch.cat(train_X_parts, 0)
    y_train = torch.cat(train_y_parts, 0)
    return X_train, y_train, X_valid, y_valid
def k_fold(k, X_train, y_train, num_epochs, lr, wd, batch_size):
    """Run k-fold cross-validation; return (mean train, mean valid) log-RMSE.

    Plots the learning curves of the first fold. Uses the module-level
    ``net``; note its weights carry over between folds (pre-existing
    behavior of this script, kept as-is).
    """
    train_l_sum, valid_l_sum = 0, 0
    for i in range(k):
        # Renamed from `data`: that name shadowed the imported
        # torch.utils.data module.
        fold = get_k_fold_data(k, i, X_train, y_train)
        train_ls, valid_ls = train(net, *fold, num_epochs, lr, wd, batch_size)
        train_l_sum += train_ls[-1]
        valid_l_sum += valid_ls[-1]
        if i == 0:
            d2l.plot(list(range(1, num_epochs + 1)), [train_ls, valid_ls],
                     xlabel='epoch', ylabel='rmse', xlim=[1, num_epochs],
                     legend=['train', 'valid'], yscale='log')
        print(f'折{i + 1},训练log rmse{float(train_ls[-1]):f},'
              f'验证log rmse{float(valid_ls[-1]):f}')
    return train_l_sum / k, valid_l_sum / k
# Hyperparameters found by manual tuning (lr=20 is unusually large but
# workable here given Adam and standardized inputs).
k,num_epochs,lr,wd,batch_size=10,500,20,0.1,300
# Run k-fold cross-validation and report the mean train/validation log-RMSE.
train_l , valid_l = k_fold(k,train_features,train_labels,num_epochs,lr,wd,batch_size)
print(f'{k}-折验证: 平均训练log rmse: {float(train_l):f}, '
f'平均验证log rmse: {float(valid_l):f}')
def train_and_pred(train_features, test_features, train_labels, test_data,
                   num_epochs, lr, weight_decay, batch_size):
    """Train on the full training set, predict the test set, and write
    submission.csv in Kaggle's Id/SalePrice format.

    Mutates `test_data` by adding a SalePrice column; uses the
    module-level `net`.
    """
    train_ls, _ = train(net, train_features, train_labels, None, None,
                        num_epochs, lr, weight_decay, batch_size)
    # Axis label fixed: was misspelled 'log rmsse'.
    d2l.plot(np.arange(1, num_epochs + 1), [train_ls], xlabel='epoch',
             ylabel='log rmse', xlim=[1, num_epochs], yscale='log')
    print(f'训练log rmse:{float(train_ls[-1]):f}')
    # Detach before converting to numpy; flatten (n, 1) predictions to 1-D.
    preds = net(test_features).detach().numpy()
    test_data['SalePrice'] = pd.Series(preds.reshape(1, -1)[0])
    submission = pd.concat([test_data['Id'], test_data['SalePrice']], axis=1)
    submission.to_csv('submission.csv', index=False)
# Retrain on the full training set with the tuned hyperparameters and
# produce submission.csv.
train_and_pred(train_features,test_features,train_labels,test_data,
num_epochs,lr,wd,batch_size)
这里调参调了一个还不错的参数,k,num_epochs,lr,wd,batch_size=10,500,20,0.1,300
最终的得分为
以上是一个很简单的线性模型,其实提升最终成绩的方式还有很多,比如说PCA降维,集成学习等等。