import pandas as pd
import numpy as np
from torch import nn, optim
import torch
import matplotlib.pyplot as plt
config = {
    'epoch': 10,
    'batch_size': 512,
    'learning_rate': 8e-3,
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
    'num_cols': ['regDate', 'creatDate', 'power', 'kilometer', 'v_0', 'v_1', 'v_2', 'v_3', 'v_4', 'v_5', 'v_6', 'v_7',
                 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13', 'v_14'],
    'cate_cols': ['model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'seller', 'notRepairedDamage']
}
# Load the raw train/test CSVs (space-separated) and concatenate them for joint preprocessing
test_data = pd.read_csv(r"C:\Users\52446\Desktop\二手车预测\used_car_testB_20200421.csv", sep=' ')
train_data = pd.read_csv(r"C:\Users\52446\Desktop\二手车预测\used_car_train_20200313.csv", sep=' ')
data = pd.concat([train_data, test_data])
# Define a one-hot encoding helper
def oneHotEncode(df, colNames):
    for col in colNames:
        dummies = pd.get_dummies(df[col], prefix=col)
        df = pd.concat([df, dummies], axis=1)
        df.drop([col], axis=1, inplace=True)
    return df
# Replace the '-' placeholder (used in notRepairedDamage) with -1 and cap power outliers at 600
data = data.replace('-', '-1')
data.notRepairedDamage = data.notRepairedDamage.astype('float32')
data.loc[data['power'] > 600, 'power'] = 600
# Handle categorical features: fill missing values, then one-hot encode
for col in config['cate_cols']:
    data[col] = data[col].fillna('-1')
data = oneHotEncode(data, config['cate_cols'])
# Handle numerical features: fill missing values, then min-max normalize to [0, 1]
for col in config['num_cols']:
    data[col] = data[col].fillna(0)
    data[col] = (data[col] - data[col].min()) / (data[col].max() - data[col].min())
# Drop (possibly) irrelevant columns
data.drop(['name', 'regionCode'], axis=1, inplace=True)
# Save the processed test set (rows with no price) for later prediction
test_data = data[pd.isna(data.price)]
test_data.to_csv('./one_hot_testB.csv')
# Remove the test rows (price is NaN) to obtain the training set
data.reset_index(inplace=True)
train_data = data.drop(data[pd.isna(data.price)].index)
# Drop the ID column
train_data.drop(['SaleID'], axis=1, inplace=True)
# Shuffle the training set
train_data = train_data.sample(frac=1)
# Separate the target
train_target = train_data['price']
train_data.drop(['price', 'index'], axis=1, inplace=True)
# Split off a validation set to monitor fitting
validation_data = train_data[:10000]
train_data = train_data[10000:]
validation_target = train_target[:10000]
train_target = train_target[10000:]
# Define the network architecture: a simple MLP with BatchNorm and ReLU
class Network(nn.Module):
    def __init__(self, in_dim, hidden_1, hidden_2, hidden_3, hidden_4):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(in_dim, hidden_1),
            nn.BatchNorm1d(hidden_1),
            nn.ReLU(),
            nn.Linear(hidden_1, hidden_2),
            nn.BatchNorm1d(hidden_2),
            nn.ReLU(),
            nn.Linear(hidden_2, hidden_3),
            nn.BatchNorm1d(hidden_3),
            nn.ReLU(),
            nn.Linear(hidden_3, hidden_4),
            nn.BatchNorm1d(hidden_4),
            nn.ReLU(),
            nn.Linear(hidden_4, 1)
        )

    def forward(self, x):
        return self.layers(x)
# Instantiate the network
model = Network(train_data.shape[1], 256, 256, 256, 32)
model.to(config['device'])
# Initialize the linear layers with Xavier initialization
for layer in model.layers:
    if isinstance(layer, nn.Linear):
        nn.init.xavier_uniform_(layer.weight)
# Convert the data to tensors and move them to the CPU/CUDA device
# (cast features to float32 first, since get_dummies may produce bool columns)
train_features = torch.tensor(train_data.values.astype(np.float32), dtype=torch.float32, device=config['device'])
train_num = train_features.shape[0]
train_labels = torch.tensor(train_target.values, dtype=torch.float32, device=config['device'])
validation_features = torch.tensor(validation_data.values.astype(np.float32), dtype=torch.float32, device=config['device'])
validation_num = validation_features.shape[0]
validation_labels = torch.tensor(validation_target.values, dtype=torch.float32, device=config['device'])
# Define the loss function and optimizer
criterion = nn.MSELoss()
criterion.to(config['device'])
optimizer = optim.Adam(model.parameters(), lr=config['learning_rate'])
# Start training
mae_list = []
for epoch in range(config['epoch']):
    losses = []
    model.train()
    for i in range(0, train_num, config['batch_size']):
        end = min(i + config['batch_size'], train_num)
        mini_batch = train_features[i:end]
        mini_batch_label = train_labels[i:end]
        pred = model(mini_batch)
        pred = pred.squeeze()
        loss = criterion(pred, mini_batch_label)
        if torch.isnan(loss):
            break
        mae = torch.abs(mini_batch_label - pred).sum() / (end - i)
        losses.append(mae.item())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # Evaluate on the validation set after each epoch
    model.eval()
    with torch.no_grad():
        pred = model(validation_features)
    validation_mae = torch.abs(validation_labels - pred.squeeze()).sum().item() / validation_num
    mae_list.append((sum(losses) / len(losses), validation_mae))
    print(f"epoch:{epoch + 1} MAE: {sum(losses) / len(losses)}, Validation_MAE: {validation_mae}")
# Save the trained model and plot training/validation MAE per epoch
torch.save(model, 'model.pth')
x = np.arange(0, config['epoch'])
y1, y2 = zip(*mae_list)
plt.plot(x, y1, label='train')
plt.plot(x, y2, label='valid')
plt.legend()
plt.show()
Finally, write the predictions for the test set to a file and the result is ready for submission.
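This last step is not shown above; below is a minimal sketch of how it might look, assuming the test features saved earlier to ./one_hot_testB.csv and a submission CSV containing SaleID and price columns (the file name submission.csv is illustrative, not prescribed by the source):

# Reload the preprocessed test features; index_col=0 skips the unnamed
# index column that to_csv wrote earlier
test_data = pd.read_csv('./one_hot_testB.csv', index_col=0)
sale_id = test_data['SaleID']
# Drop the same non-feature columns as in training so dimensions match the model input
test_features = test_data.drop(['SaleID', 'price'], axis=1)
test_tensor = torch.tensor(test_features.values.astype(np.float32),
                           dtype=torch.float32, device=config['device'])
# Run inference in eval mode without building a computation graph
model.eval()
with torch.no_grad():
    test_pred = model(test_tensor).squeeze().cpu().numpy()
# Assumed submission format: one SaleID column and one price column
submission = pd.DataFrame({'SaleID': sale_id.values, 'price': test_pred})
submission.to_csv('./submission.csv', index=False)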