保存并读取数据集
第一步:保存kaggle上的数据集到本地
注册kaggle账号无法显示验证码:https://blog.azurezeng.com/recaptcha-use-in-china/
下载kaggle数据集到本地:https://zhuanlan.zhihu.com/p/266570781?ivk_sa=1024320u&utm_id=0
第二步:读取本地保存的数据集
使用pandas读取csv文件并处理
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils import data
# Load the training and test splits of the Kaggle house-prices dataset from
# the local CSV files.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        r"C:\Users\Lenovo\Kaggle\house-prices-advanced-regression-techniques\train.csv",
        r"C:\Users\Lenovo\Kaggle\house-prices-advanced-regression-techniques\test.csv",
    )
)

# Quick sanity check: container type, then (rows, columns) of each split.
print(type(train_data))
for frame in (train_data, test_data):
    print(frame.shape)
<class 'pandas.core.frame.DataFrame'>
(1460, 81)
(1459, 80)
# Preview the first four rows of train_data: the first four columns (Id plus
# three features), the last two features, and the SalePrice label.
print(train_data.iloc[0:4,[0,1,2,3,-3,-2,-1]])
Id MSSubClass MSZoning LotFrontage SaleType SaleCondition SalePrice
0 1 60 RL 65.0 WD Normal 208500
1 2 20 RL 80.0 WD Normal 181500
2 3 60 RL 68.0 WD Normal 223500
3 4 70 RL 60.0 WD Abnorml 140000
对train_data的特征进行观察发现第一列的Id数据不应该被认为是特征
使用DataFrame.iloc对数据进行选择:https://blog.csdn.net/Penguin_zlh/article/details/106296356
# Stack the train and test features into one frame so both splits receive
# identical preprocessing. The first column (Id) is dropped from both splits;
# the last training column (SalePrice, the label) is dropped as well.
train_part = train_data.iloc[:, 1:-1]
test_part = test_data.iloc[:, 1:]
all_features = pd.concat((train_part, test_part))
数据预处理
首先对数字类型进行插值,将缺失的值补为正常的数值
# Inspect the data type of every column.
all_features.dtypes
MSSubClass int64
MSZoning object
LotFrontage float64
LotArea int64
Street object
...
MiscVal int64
MoSold int64
YrSold int64
SaleType object
SaleCondition object
Length: 79, dtype: object
# Keep only the columns whose dtype is not 'object', i.e. the numeric ones.
all_features.dtypes[all_features.dtypes!='object']
MSSubClass int64
LotFrontage float64
LotArea int64
OverallQual int64
OverallCond int64
YearBuilt int64
YearRemodAdd int64
MasVnrArea float64
BsmtFinSF1 float64
BsmtFinSF2 float64
BsmtUnfSF float64
TotalBsmtSF float64
1stFlrSF int64
2ndFlrSF int64
LowQualFinSF int64
GrLivArea int64
BsmtFullBath float64
BsmtHalfBath float64
FullBath int64
HalfBath int64
BedroomAbvGr int64
KitchenAbvGr int64
TotRmsAbvGrd int64
Fireplaces int64
GarageYrBlt float64
GarageCars float64
GarageArea float64
WoodDeckSF int64
OpenPorchSF int64
EnclosedPorch int64
3SsnPorch int64
ScreenPorch int64
PoolArea int64
MiscVal int64
MoSold int64
YrSold int64
dtype: object
对数值类型的特征进行标准化(减去均值后除以标准差);标准化后每个特征的均值为0,因此可以将缺失值补为0,这等价于用均值填充
# Names of all numeric (non-object) columns.
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index

# Standardize every numeric column to zero mean and unit standard deviation.
# The parentheses matter: this must be (x - mean) / std. Writing
# x - mean / std only shifts each column by a constant and leaves it unscaled.
all_features[numeric_features] = all_features[numeric_features].apply(
    lambda x: (x - x.mean()) / x.std())

# After standardization each column has mean 0, so replacing missing values
# with 0 is equivalent to imputing the column mean.
all_features[numeric_features] = all_features[numeric_features].fillna(0)
其次对非数字类型进行独热编码:https://deepinout.com/pandas/pandas-questions/131_pandas_how_can_i_one_hot_encode_in_python.html
# One-hot encode every non-numeric column. dummy_na=True treats a missing
# value as a valid category and creates an indicator column for it.
# The dtype is pinned to an integer type so the indicator columns are 0/1
# numbers on every pandas version; newer pandas releases default to bool,
# which does not mix cleanly with the float columns when the frame is later
# converted to a float tensor.
all_features = pd.get_dummies(all_features, dummy_na=True, dtype=np.uint8)
print(all_features.shape)
(2919, 331)
# Show the preprocessed feature table (numeric columns plus one-hot columns).
print(all_features)
MSSubClass LotFrontage LotArea OverallQual OverallCond \
0 58.65614 62.031224 8448.710775 2.681348 0.000967
1 18.65614 77.031224 9598.710775 1.681348 3.000967
2 58.65614 65.031224 11248.710775 2.681348 0.000967
3 68.65614 57.031224 9548.710775 2.681348 0.000967
4 58.65614 81.031224 14258.710775 3.681348 0.000967
... ... ... ... ... ...
1454 158.65614 18.031224 1934.710775 -0.318652 2.000967
1455 158.65614 18.031224 1892.710775 -0.318652 0.000967
1456 18.65614 157.031224 19998.710775 0.681348 2.000967
1457 83.65614 59.031224 10439.710775 0.681348 0.000967
1458 58.65614 71.031224 9625.710775 2.681348 0.000967
YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 BsmtFinSF2 ... \
0 1937.92179 1908.033417 195.430107 705.03114 -0.29303 ...
1 1910.92179 1881.033417 -0.569893 977.03114 -0.29303 ...
2 1935.92179 1907.033417 161.430107 485.03114 -0.29303 ...
3 1849.92179 1875.033417 -0.569893 215.03114 -0.29303 ...
4 1934.92179 1905.033417 349.430107 654.03114 -0.29303 ...
... ... ... ... ... ... ...
1454 1904.92179 1875.033417 -0.569893 -0.96886 -0.29303 ...
1455 1904.92179 1875.033417 -0.569893 251.03114 -0.29303 ...
1456 1894.92179 1901.033417 -0.569893 1223.03114 -0.29303 ...
1457 1926.92179 1897.033417 -0.569893 336.03114 -0.29303 ...
1458 1927.92179 1899.033417 93.430107 757.03114 -0.29303 ...
SaleType_Oth SaleType_WD SaleType_nan SaleCondition_Abnorml \
0 0 1 0 0
1 0 1 0 0
2 0 1 0 0
3 0 1 0 1
4 0 1 0 0
... ... ... ... ...
1454 0 1 0 0
1455 0 1 0 1
1456 0 1 0 1
1457 0 1 0 0
1458 0 1 0 0
SaleCondition_AdjLand SaleCondition_Alloca SaleCondition_Family \
0 0 0 0
1 0 0 0
2 0 0 0
3 0 0 0
4 0 0 0
... ... ... ...
1454 0 0 0
1455 0 0 0
1456 0 0 0
1457 0 0 0
1458 0 0 0
SaleCondition_Normal SaleCondition_Partial SaleCondition_nan
0 1 0 0
1 1 0 0
2 1 0 0
3 0 0 0
4 1 0 0
... ... ... ...
1454 1 0 0
1455 0 0 0
1456 0 0 0
1457 1 0 0
1458 1 0 0
[2919 rows x 331 columns]
将DataFrame格式先转换为Numpy数组,再转换为torch张量
# The first n_train rows of all_features came from the training CSV; the
# remaining rows came from the test CSV.
n_train = len(train_data)

# Pull the underlying NumPy array out once, then slice it per split.
feature_matrix = all_features.values
train_features = torch.tensor(feature_matrix[:n_train], dtype=torch.float32)
test_features = torch.tensor(feature_matrix[n_train:], dtype=torch.float32)

# Labels become a column vector so their first dimension matches the features.
price_column = train_data["SalePrice"].values.reshape(-1, 1)
train_labels = torch.tensor(price_column, dtype=torch.float32)

for split_tensor in (train_features, test_features, train_labels):
    print(split_tensor.shape)
torch.Size([1460, 331])
torch.Size([1459, 331])
torch.Size([1460, 1])
开始训练
# Mean squared error is the training objective.
loss=nn.MSELoss()
# Number of input features per example (second dimension of train_features).
n_features=train_features.shape[1]
def get_net(in_features=None):
    """Build a fresh single-layer linear regression model.

    Args:
        in_features: Number of input features. When omitted, the
            module-level ``n_features`` is used, which is the original
            behaviour of this function.

    Returns:
        An ``nn.Sequential`` holding one ``nn.Linear(in_features, 1)`` layer.
    """
    if in_features is None:
        in_features = n_features
    return nn.Sequential(nn.Linear(in_features, 1))
第一步:使用TensorDataset将张量类型的train_features与train_labels进行打包,注意第一维度必须一致:https://blog.csdn.net/anshiquanshu/article/details/109398797
第二步:使用DataLoader得到迭代器
def train_and_pred(train_features, train_labels, test_features,
                   num_epochs, learning_rate, weight_decay, batch_size):
    """Train a fresh model on the training data and predict on the test data.

    Args:
        train_features: Tensor of training inputs.
        train_labels: Tensor of training targets; its first dimension must
            match ``train_features`` so the two can be paired per example.
        test_features: Tensor of inputs to predict on after training.
        num_epochs: Number of full passes over the training data.
        learning_rate: Learning rate passed to the Adam optimizer.
        weight_decay: Weight decay passed to the Adam optimizer.
        batch_size: Number of examples per mini-batch.

    Returns:
        Tensor of predictions for ``test_features`` that does not track
        gradients.
    """
    net = get_net()

    # Pair each feature row with its label, then iterate in mini-batches.
    # NOTE(review): batches are drawn in dataset order (no shuffling) —
    # confirm this is intended.
    dataset = data.TensorDataset(train_features, train_labels)
    train_iter = data.DataLoader(dataset, batch_size=batch_size)

    optimizer = torch.optim.Adam(net.parameters(),
                                 lr=learning_rate,
                                 weight_decay=weight_decay)

    for _ in range(num_epochs):
        for X, y in train_iter:
            optimizer.zero_grad()
            batch_loss = loss(net(X), y)
            batch_loss.backward()
            optimizer.step()

    # Inference needs no autograd graph; skip building one instead of
    # building it and detaching afterwards.
    with torch.no_grad():
        preds = net(test_features)
    return preds
# Hyperparameters for the training run.
num_epochs = 100  # passes over the training data
lr = 0.1          # learning rate
wd = 0            # weight decay
bs = 64           # mini-batch size

# Train on the full training set and print the test-set predictions.
results = train_and_pred(
    train_features,
    train_labels,
    test_features,
    num_epochs=num_epochs,
    learning_rate=lr,
    weight_decay=wd,
    batch_size=bs,
)
print(results)
tensor([[147158.1250],
[178364.9062],
[194561.8125],
...,
[200548.2969],
[102706.2031],
[231744.9531]])