简介
Kaggle 是一个面向机器学习爱好者的数据科学竞赛与交流平台。
本例对应的比赛网址:https://www.kaggle.com/c/house-prices-advanced-regression-techniques
比赛数据集可以在比赛页面点击“Data”标签页获取。下文代码假定 train.csv 和 test.csv 已下载到 ./data/kaggle_house/ 目录下。
代码
参考代码如下:
%matplotlib inline
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import d2lzh_pytorch as d2l
# Use 32-bit float CPU tensors as the default tensor type.
# NOTE(review): recent PyTorch releases appear to deprecate this call in
# favour of torch.set_default_dtype -- confirm against the installed version.
torch.set_default_tensor_type(torch.FloatTensor)

# Load the training and test tables of the Kaggle house-price competition.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/kaggle_house/train.csv',
                     './data/kaggle_house/test.csv')
)

# Show both shapes, one per line. Compared with train, test has one column
# fewer (the label); the model built later has to predict that label.
print(train_data.shape, test_data.shape, sep='\n')
# Stack the train and test feature columns so both sets receive identical
# preprocessing. Column 0 is skipped in both frames, and the last train column
# (presumably the SalePrice label the model must predict) is left out as well.
all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))

# Names of the numeric columns. select_dtypes is used instead of comparing
# dtypes against 'object' so that text columns are never mistaken for numeric
# ones, whatever dtype pandas infers for strings.
numeric_features = all_features.select_dtypes(include='number').columns

# Standardize every numeric column to zero mean and unit standard deviation.
all_features[numeric_features] = all_features[numeric_features].apply(
    lambda x: (x - x.mean()) / (x.std()))

# After standardization each numeric column has mean 0, so replacing a missing
# value with 0 amounts to mean imputation. Only the numeric columns are
# filled: filling the categorical columns too would erase their NaNs before
# get_dummies sees them and leave every dummy_na indicator column all zero.
all_features[numeric_features] = all_features[numeric_features].fillna(0)

# One-hot encode the categorical columns. dummy_na=True treats a missing value
# as a legitimate category with its own indicator column. dtype=float keeps
# the whole frame numeric: recent pandas versions emit bool indicators by
# default, which turns `.values` into an object array that torch.tensor
# rejects.
all_features = pd.get_dummies(all_features, dummy_na=True, dtype=float)

# The first n_train rows came from train_data, the remainder from test_data.
n_train = train_data.shape[0]
train_features = torch.tensor(all_features[:n_train].values, dtype=torch.float)
test_features = torch.tensor(all_features[n_train:].values, dtype=torch.float)
train_labels = torch.tensor(train_data.SalePrice