先从Kaggle(网站地址:https://www.kaggle.com )下载数据集(注册账号需要翻墙)
1.获取和读取数据集
import gluonbook as gb
from mxnet import autograd, gluon, init, nd
from mxnet.gluon import data as gdata, loss as gloss, nn
import numpy as np
import sys
import pandas as pd
# Load the Kaggle "House Prices" train/test CSVs from a local absolute path.
# NOTE(review): hard-coded Windows path — adjust for your environment.
train_data = pd.read_csv('G:/代码/pycharm/data/train.csv')
test_data = pd.read_csv('G:/代码/pycharm/data/test.csv')
print(test_data.shape)
2.数据预处理(初始化其中的空值)
对连续数值的特征做标准化(standardization):将该特征的每个值先减去 µ(均值) 再除以 σ(标准差) 得到标准化后的每个特征值。
# Concatenate train (dropping Id and the SalePrice label) and test (dropping Id)
# so both get the same preprocessing and the same dummy columns.
all_features=pd.concat((train_data.iloc[:,1:-1],test_data.iloc[:,1:]))
# Columns whose dtype is not 'object' are treated as numeric.
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
# Standardize each numeric feature: (x - mean) / std.
all_features[numeric_features]=all_features[numeric_features].apply(lambda x:(x-x.mean())/(x.std()))
# After standardization the mean is ~0, so filling NaN with the mean fills ~0.
all_features=all_features.fillna(all_features.mean())
all_features=pd.get_dummies(all_features,dummy_na=True)# one-hot encode categorical features (NaN gets its own column)
n_train = train_data.shape[0]
train_features = nd.array(all_features[:n_train].values)
test_features = nd.array(all_features[n_train:].values)# convert to NDArray for training
train_labels = nd.array(train_data.SalePrice.values).reshape((-1, 1))
注:mean()函数:求平均值 std()函数:求标准差(方差的算术平方根)
3.训练模型
使用L2范数来定义损失函数
# Squared (L2) loss: 1/2 * (pred - label)^2; used globally by train/log_rmse.
loss = gloss.L2Loss()
def get_net():
    """Build and initialize a single-output linear-regression network.

    Returns:
        mxnet.gluon.nn.Sequential: initialized net with one Dense(1) layer.
    """
    # Indentation restored — the pasted original was flat and would not parse.
    net = nn.Sequential()  # Sequential: a container chaining layers in order
    net.add(nn.Dense(1))   # Dense(1): fully connected layer, one output
    net.initialize()
    return net
定义比赛中用来评价模型的对数均方根误差(log RMSE)
def log_rmse(net, train_features, train_labels):
    """Return the competition's log-RMSE of `net` on the given data as a float."""
    # Clamp predictions below 1 up to 1 so taking the log is numerically stable.
    clipped_preds = nd.clip(net(train_features), 1, float('inf'))
    # `loss` is L2Loss = 1/2*(a-b)^2, hence the factor of 2 before the sqrt.
    rmse = nd.sqrt(2 * loss(clipped_preds.log(), train_labels.log()).mean())
    return rmse.asscalar()  # convert the 1-element NDArray to a Python scalar
训练方法和以前的一样
def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    """Train `net` with Adam; return per-epoch log-RMSE lists (train, test).

    `test_ls` stays empty when `test_labels` is None (final full-data training).
    """
    train_ls, test_ls = [], []
    train_iter = gdata.DataLoader(
        gdata.ArrayDataset(train_features, train_labels), batch_size,
        shuffle=True)
    # Adam optimizer; 'wd' (weight decay) implements L2 regularization.
    trainer = gluon.Trainer(net.collect_params(), 'adam',
                            {'learning_rate': learning_rate,
                             'wd': weight_decay})
    for epoch in range(num_epochs):
        for X, y in train_iter:
            # BUG FIX: original said `auto.record()` — `auto` is undefined
            # (NameError); the imported module is `autograd`.
            with autograd.record():
                l = loss(net(X), y)
            l.backward()
            trainer.step(batch_size)
        train_ls.append(log_rmse(net, train_features, train_labels))
        if test_labels is not None:
            test_ls.append(log_rmse(net, test_features, test_labels))
    return train_ls, test_ls
4.使用K-折交叉验证
def get_k_fold_data(k, i, X, y):
    """Return the i-th train/validation split for k-fold cross-validation.

    :param k: number of folds (must be > 1)
    :param i: index of the fold used as validation data
    :param X: full feature array (rows are samples)
    :param y: full label array
    :return: (X_train, y_train, X_valid, y_valid)
    """
    assert k > 1
    # Integer division: the last X.shape[0] % k samples are dropped.
    fold_size = X.shape[0] // k
    X_train, y_train = None, None
    for j in range(k):
        idx = slice(j * fold_size, (j + 1) * fold_size)
        X_part, y_part = X[idx, :], y[idx]
        if j == i:
            # This fold is the validation set ("vaild" typo fixed in locals).
            X_valid, y_valid = X_part, y_part
        elif X_train is None:
            X_train, y_train = X_part, y_part
        else:
            X_train = nd.concat(X_train, X_part, dim=0)
            y_train = nd.concat(y_train, y_part, dim=0)
    return X_train, y_train, X_valid, y_valid
首先获取第i次进行k折交叉验证的数据
进行k-折交叉验证:
def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay,
           batch_size):
    """Run k-fold cross-validation; return average final train/valid log-RMSE."""
    train_l_sum, valid_l_sum = 0, 0
    for i in range(k):
        data = get_k_fold_data(k, i, X_train, y_train)
        net = get_net()  # fresh, re-initialized net for every fold
        train_ls, valid_ls = train(net, *data, num_epochs, learning_rate,
                                   weight_decay, batch_size)
        train_l_sum += train_ls[-1]
        valid_l_sum += valid_ls[-1]
        if i == 0:
            # Plot learning curves for the first fold only.
            gb.semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'rmse',
                        range(1, num_epochs + 1), valid_ls, ['train', 'valid'])
        print('fold %d, train rmse: %f, valid rmse: %f'
              % (i, train_ls[-1], valid_ls[-1]))
    return train_l_sum / k, valid_l_sum / k
5.初始化参数开始训练
# Hyperparameters: 5 folds, 100 epochs, lr=5, no weight decay, batch size 64.
k, num_epochs, lr, weight_decay, batch_size = 5, 100, 5, 0, 64
verbose_epoch = num_epochs - 2
train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr,weight_decay, batch_size)
print('%d-fold validation: avg train rmse: %f, avg valid rmse: %f'% (k, train_l, valid_l))
最后查看输出结果比较
最后使用完整的训练数据集来重新训练模型,并将预测结果存成提交所需要的格式
def train_and_pred(train_features, test_feature, train_labels, test_data,
                   num_epochs, lr, weight_decay, batch_size):
    """Retrain on the full training set and write predictions to submission.csv."""
    net = get_net()
    # No held-out set here, so test features/labels are None.
    train_ls, _ = train(net, train_features, train_labels, None, None,
                        num_epochs, lr, weight_decay, batch_size)
    gb.semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'rmse')  # plot
    print('train rmse %f' % train_ls[-1])
    # BUG FIX: predict on the `test_feature` parameter; the original body read
    # the module-level global `test_features`, silently ignoring the argument.
    preds = net(test_feature).asnumpy()
    # Flatten (n, 1) predictions into a 1-D Series for the submission column.
    test_data['SalePrice'] = pd.Series(preds.reshape(1, -1)[0])
    submission = pd.concat([test_data['Id'], test_data['SalePrice']], axis=1)
    submission.to_csv('submission.csv', index=False)
最后加上全部的源码
import gluonbook as gb
from mxnet import autograd, gluon, init, nd
from mxnet.gluon import data as gdata, loss as gloss, nn
import numpy as np
import sys
import pandas as pd
def get_net():
    """Build and initialize a single-output linear-regression network.

    Returns:
        mxnet.gluon.nn.Sequential: initialized net with one Dense(1) layer.
    """
    # Indentation restored — the pasted original was flat and would not parse.
    net = nn.Sequential()  # Sequential: a container chaining layers in order
    net.add(nn.Dense(1))   # Dense(1): fully connected layer, one output
    net.initialize()
    return net
def log_rmse(net, train_features, train_labels):
    """Return the competition's log-RMSE of `net` on the given data as a float."""
    # Clamp predictions below 1 up to 1 so taking the log is numerically stable.
    clipped_preds = nd.clip(net(train_features), 1, float('inf'))
    # `loss` is L2Loss = 1/2*(a-b)^2, hence the factor of 2 before the sqrt.
    rmse = nd.sqrt(2 * loss(clipped_preds.log(), train_labels.log()).mean())
    return rmse.asscalar()  # convert the 1-element NDArray to a Python scalar
def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    """Train `net` with Adam; return per-epoch log-RMSE lists (train, test).

    `test_ls` stays empty when `test_labels` is None (final full-data training).
    """
    train_ls, test_ls = [], []
    train_iter = gdata.DataLoader(
        gdata.ArrayDataset(train_features, train_labels), batch_size,
        shuffle=True)
    # Adam optimizer; 'wd' (weight decay) implements L2 regularization.
    trainer = gluon.Trainer(net.collect_params(), 'adam',
                            {'learning_rate': learning_rate,
                             'wd': weight_decay})
    for epoch in range(num_epochs):
        for X, y in train_iter:
            with autograd.record():  # record the forward pass for autodiff
                l = loss(net(X), y)
            l.backward()
            trainer.step(batch_size)
        train_ls.append(log_rmse(net, train_features, train_labels))
        if test_labels is not None:
            test_ls.append(log_rmse(net, test_features, test_labels))
    return train_ls, test_ls
def get_k_fold_data(k, i, X, y):
    """Return the i-th train/validation split for k-fold cross-validation.

    :param k: number of folds (must be > 1)
    :param i: index of the fold used as validation data
    :param X: full feature array (rows are samples)
    :param y: full label array
    :return: (X_train, y_train, X_valid, y_valid)
    """
    assert k > 1
    # Integer division: the last X.shape[0] % k samples are dropped.
    fold_size = X.shape[0] // k
    X_train, y_train = None, None
    for j in range(k):
        idx = slice(j * fold_size, (j + 1) * fold_size)
        X_part, y_part = X[idx, :], y[idx]
        if j == i:
            # This fold is the validation set ("vaild" typo fixed in locals).
            X_valid, y_valid = X_part, y_part
        elif X_train is None:
            X_train, y_train = X_part, y_part
        else:
            X_train = nd.concat(X_train, X_part, dim=0)
            y_train = nd.concat(y_train, y_part, dim=0)
    return X_train, y_train, X_valid, y_valid
def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay,
           batch_size):
    """Run k-fold cross-validation; return average final train/valid log-RMSE.

    Unlike the tutorial version, this copy had plotting commented out; the dead
    commented-out code is removed and only the per-fold print remains.
    """
    train_l_sum, valid_l_sum = 0, 0
    for i in range(k):
        data = get_k_fold_data(k, i, X_train, y_train)
        net = get_net()  # fresh, re-initialized net for every fold
        train_ls, valid_ls = train(net, *data, num_epochs, learning_rate,
                                   weight_decay, batch_size)
        train_l_sum += train_ls[-1]
        valid_l_sum += valid_ls[-1]
        print('fold %d, train rmse: %f, valid rmse: %f'
              % (i, train_ls[-1], valid_ls[-1]))
    return train_l_sum / k, valid_l_sum / k
def train_and_pred(train_features, test_feature, train_labels, test_data,
                   num_epochs, lr, weight_decay, batch_size):
    """Retrain on the full training set and write predictions to submission.csv."""
    net = get_net()
    # No held-out set here, so test features/labels are None.
    train_ls, _ = train(net, train_features, train_labels, None, None,
                        num_epochs, lr, weight_decay, batch_size)
    gb.semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'rmse')  # plot
    print('train rmse %f' % train_ls[-1])
    # BUG FIX: predict on the `test_feature` parameter; the original body read
    # the module-level global `test_features`, silently ignoring the argument.
    preds = net(test_feature).asnumpy()
    # Flatten (n, 1) predictions into a 1-D Series for the submission column.
    test_data['SalePrice'] = pd.Series(preds.reshape(1, -1)[0])
    submission = pd.concat([test_data['Id'], test_data['SalePrice']], axis=1)
    submission.to_csv('submission.csv', index=False)
# --- Script entry: load data, preprocess, cross-validate, then predict. ---
# NOTE(review): hard-coded Windows path — adjust for your environment.
train_data = pd.read_csv('G:/代码/pycharm/data/train.csv')
test_data = pd.read_csv('G:/代码/pycharm/data/test.csv')
# print(test_data.shape)
# print( train_data.iloc[0:4, [0, 1, 2, 3, -3, -2, -1]])
# Concatenate train (dropping Id and SalePrice) and test (dropping Id) so both
# receive identical preprocessing and identical dummy columns.
all_features=pd.concat((train_data.iloc[:,1:-1],test_data.iloc[:,1:]))
# print(all_features.dtypes)
# Columns whose dtype is not 'object' are treated as numeric.
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
# Standardize numeric features: (x - mean) / std.
all_features[numeric_features]=all_features[numeric_features].apply(lambda x:(x-x.mean())/(x.std()))
# After standardization the mean is ~0, so filling NaN with the mean fills ~0.
all_features=all_features.fillna(all_features.mean())
# One-hot encode categorical features (NaN gets its own indicator column).
all_features=pd.get_dummies(all_features,dummy_na=True)
n_train = train_data.shape[0]
# Split back into train/test and convert to NDArray for training.
train_features = nd.array(all_features[:n_train].values)
test_features = nd.array(all_features[n_train:].values)
train_labels = nd.array(train_data.SalePrice.values).reshape((-1, 1))
#
# print(train_features.shape)
# print(train_labels.shape)
# Squared (L2) loss used globally by train/log_rmse (late-bound, so defining
# it after the functions is fine as long as it precedes any call).
loss=gloss.L2Loss()
# Hyperparameters: 5 folds, 100 epochs, lr=5, no weight decay, batch size 64.
k, num_epochs, lr, weight_decay, batch_size = 5, 100, 5, 0, 64
verbose_epoch = num_epochs - 2
train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr,weight_decay, batch_size)
print('%d-fold validation: avg train rmse: %f, avg valid rmse: %f'% (k, train_l, valid_l))
train_and_pred(train_features, test_features, train_labels, test_data,num_epochs, lr, weight_decay, batch_size)