Hands-On Deep Learning 9: A Kaggle Used-Car Transaction Price Competition

I. Understanding the Data

1. Importing the data

train_df = pd.read_csv(config['train_path'],sep=' ')
test_df = pd.read_csv(config['test_path'],sep=' ')
df = pd.concat([train_df,test_df],axis=0).reset_index(drop=True)
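The snippets throughout this post assume a handful of standard imports plus a `config` dict. A minimal sketch is below; the file paths and column lists are assumptions, so adapt them to the actual competition schema.

import os
import logging

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.utils.data as D
from torch.utils.data import Dataset
from torch.nn.init import xavier_normal_
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

logger = logging.getLogger(__name__)

# Hypothetical config; paths and column names are placeholders
config = {
    'train_path': 'data/used_car_train.csv',
    'test_path': 'data/used_car_test.csv',
    'cate_cols': ['brand', 'bodyType', 'fuelType', 'gearbox', 'notRepairedDamage'],
    'num_cols': ['power', 'kilometer', 'v_0', 'v_1', 'v_2'],
    'batch_size': 256,
    'lr': 1e-3,
    'epoch': 10,
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
    'model_ckpt_dir': './ckpt',
}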

2. Analyzing the data

1) power: engine power, documented range [0, 600]

# Cap out-of-range values at 600
df.loc[df['power'] > 600, 'power'] = 600
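An equivalent one-liner using pandas' clip does the same capping:

df['power'] = df['power'].clip(upper=600)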

2) Handling NaN values

# Fill missing categorical values with a sentinel category
# (these columns are later label-encoded into integer ids for the embedding layer)
for col in config['cate_cols']:
    df[col] = df[col].fillna('-1')
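The embedding layer defined later expects every categorical column to be integer-encoded, plus a vocab_map recording each column's vocabulary size. That encoding step is not shown in the post (the K-fold section mentions an enc_dict); one minimal way to build it, as a sketch:

# Label-encode each categorical column and record its vocabulary size
vocab_map = {}
for col in config['cate_cols']:
    categories = df[col].astype(str).unique().tolist()
    mapping = {cat: idx for idx, cat in enumerate(categories)}
    df[col] = df[col].astype(str).map(mapping)
    vocab_map[col] = {'vocab_size': len(categories)}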

3) Min-max scaling of continuous features

# Continuous features: fill missing values, then min-max scale to [0, 1]
for col in config['num_cols']:
    df[col] = df[col].fillna(0)
    df[col] = (df[col]-df[col].min()) / (df[col].max()-df[col].min())

4) Log-transforming the heavy-tailed target

train_df['price'] = np.log(train_df['price'])
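One caveat: np.log requires strictly positive prices. np.log1p paired with np.expm1 is a common, slightly safer variant; a quick round-trip check:

prices = np.array([0.0, 500.0, 25000.0])
logged = np.log1p(prices)    # log(1 + x): safe at zero
restored = np.expm1(logged)  # exact inverse
assert np.allclose(prices, restored)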

5) Plotting density and frequency charts

# Categorical features: plot how often each category occurs
for col in config['cate_cols']:
    # Count category frequencies
    counts = df[col].value_counts()

    # Draw a bar chart
    counts.plot(kind='bar')

    # Title and axis labels
    plt.title(f'{col} Frequencies')
    plt.xlabel(col)
    plt.ylabel('Frequency')

    # Show the figure
    plt.show()
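The heading also mentions density plots; a matching sketch for the continuous features (pandas' density plot needs scipy for the KDE):

# Continuous features: plot an estimated density for each column
for col in config['num_cols']:
    df[col].plot(kind='density')
    plt.title(f'{col} Density')
    plt.xlabel(col)
    plt.show()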

3. Batched data loading

Points to keep in mind:

  1. Understand the raw form of the data: what delimiter it uses, the type of each field, and so on.
  2. Understand how each field is encoded, e.g. continuous vs. categorical.
  3. Understand the role of each field: which columns are features and which is the target.
  4. Understand how the data I/O is performed.
# Dataset construction: subclass torch.utils.data.Dataset
class SaleDataset(Dataset):
    def __init__(self,df,cate_cols,num_cols):
        self.df = df
        self.feature_name = cate_cols + num_cols
            
    def __getitem__(self, index):
        data = dict()
        ### Feature columns
        for col in self.feature_name:
            data[col] = torch.Tensor([self.df[col].iloc[index]]).squeeze(-1)
        ### Target column (present only in the training data)
        if 'price' in self.df.columns:
            data['price'] = torch.Tensor([self.df['price'].iloc[index]]).squeeze(-1)
        return data

    def __len__(self):
        return len(self.df)
    
def get_dataloader(df, cate_cols ,num_cols, batch_size=256, num_workers=2, shuffle=True):
    dataset = SaleDataset(df,cate_cols,num_cols)
    dataloader = D.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)
    return dataloader
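A quick smoke test of the loader, assuming the preprocessing above has already been applied to train_df: each entry in the batch dict is a 1-D tensor of length batch_size.

train_loader = get_dataloader(train_df, config['cate_cols'], config['num_cols'],
                              batch_size=4, num_workers=0)
batch = next(iter(train_loader))
for key, value in batch.items():
    print(key, value.shape)  # each tensor is torch.Size([4])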

4. Defining the model

The model can be broken into submodules that are defined separately and assembled at the end:

  1. Embedding layer: encodes the categorical features
# Embedding layer: maps each categorical (sparse) feature id to a dense vector
class EmbeddingLayer(nn.Module):
    def __init__(self,
                 vocab_map = None,
                 embedding_dim = None):
        super(EmbeddingLayer, self).__init__()
        self.vocab_map = vocab_map
        self.embedding_dim = embedding_dim
        self.embedding_layer = nn.ModuleDict()

        self.emb_feature = []
        # A ModuleDict holds one embedding table per categorical feature
        for col in self.vocab_map.keys():
            self.emb_feature.append(col)
            self.embedding_layer.update({col : nn.Embedding(
                self.vocab_map[col]['vocab_size'],
                self.embedding_dim,
            )})

    def forward(self, X):
        # Embed every sparse (categorical) feature in turn
        feature_emb_list = []
        for col in self.emb_feature:
            inp = X[col].long().view(-1, 1)
            feature_emb_list.append(self.embedding_layer[col](inp))
        return torch.cat(feature_emb_list,dim=1)
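A shape check for the embedding layer with a toy vocab_map (the numbers here are made up):

toy_vocab = {'brand': {'vocab_size': 40}, 'gearbox': {'vocab_size': 3}}
emb = EmbeddingLayer(vocab_map=toy_vocab, embedding_dim=16)
batch = {'brand': torch.tensor([1, 5]), 'gearbox': torch.tensor([0, 2])}
out = emb(batch)
print(out.shape)  # torch.Size([2, 2, 16]) -> [batch, num_cate_cols, emb]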
  2. MLP layer
#MLP
class MLP(nn.Module):
    def __init__(self,
                 input_dim,
                 output_dim=None,
                 hidden_units=[],
                 hidden_activations="ReLU",
                 final_activation=None,
                 dropout_rates=0,
                 batch_norm=False,
                 use_bias=True):
        super(MLP, self).__init__()
        dense_layers = []
        if not isinstance(dropout_rates, list):
            dropout_rates = [dropout_rates] * len(hidden_units)
        if not isinstance(hidden_activations, list):
            hidden_activations = [hidden_activations] * len(hidden_units)
        hidden_activations = [self.set_activation(x) for x in hidden_activations]
        hidden_units = [input_dim] + hidden_units
        for idx in range(len(hidden_units) - 1):
            dense_layers.append(nn.Linear(hidden_units[idx], hidden_units[idx + 1], bias=use_bias))
            if batch_norm:
                dense_layers.append(nn.BatchNorm1d(hidden_units[idx + 1]))
            if hidden_activations[idx]:
                dense_layers.append(hidden_activations[idx])
            if dropout_rates[idx] > 0:
                dense_layers.append(nn.Dropout(p=dropout_rates[idx]))
        if output_dim is not None:
            dense_layers.append(nn.Linear(hidden_units[-1], output_dim, bias=use_bias))
        if final_activation is not None:
            dense_layers.append(self.set_activation(final_activation))
        self.dnn = nn.Sequential(*dense_layers)  # * used to unpack list
    
    def set_activation(self,activation):
        if isinstance(activation, str):
            if activation.lower() == "relu":
                return nn.ReLU()
            elif activation.lower() == "sigmoid":
                return nn.Sigmoid()
            elif activation.lower() == "tanh":
                return nn.Tanh()
            else:
                return getattr(nn, activation)()
        else:
            return activation

    def forward(self, inputs):
        return self.dnn(inputs)
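And a shape check for the MLP:

mlp = MLP(input_dim=8, output_dim=1, hidden_units=[32, 16],
          hidden_activations="ReLU", batch_norm=True)
x = torch.randn(4, 8)
print(mlp(x).shape)  # torch.Size([4, 1])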
  3. Assembling the full model
class SaleModel(nn.Module):
    def __init__(self,
                is_use_cate_cols = True,
                vocab_map = None,
                embedding_dim = 16,
                num_cols = None,
                cate_cols = None,
                hidden_units = [256,128,64,32],
                loss_fun = 'nn.L1Loss()'):
        super(SaleModel, self).__init__()
        self.is_use_cate_cols = is_use_cate_cols
        self.vocab_map = vocab_map
        self.embedding_dim = embedding_dim
        self.num_cols = num_cols
        self.num_nums_fea = len(num_cols)
        self.hidden_units = hidden_units
        self.loss_fun = eval(loss_fun) # self.loss_fun  = nn.L1Loss()
        ### If categorical features are used, embed them first
        if is_use_cate_cols:
            self.emb_layer = EmbeddingLayer(vocab_map=vocab_map,embedding_dim=embedding_dim)
            self.mlp = MLP(
                 self.num_nums_fea + self.embedding_dim*len(vocab_map),
                 output_dim=1,
                 hidden_units=self.hidden_units,
                 hidden_activations="ReLU",
                 final_activation=None,
                 dropout_rates=0,
                 batch_norm=True,
                 use_bias=True)
        else:
            self.mlp = MLP(
                 self.num_nums_fea,
                 output_dim=1,
                 hidden_units=self.hidden_units,
                 hidden_activations="ReLU",
                 final_activation=None,
                 dropout_rates=0,
                 batch_norm=True,
                 use_bias=True)
        self.apply(self._init_weights)

    ### Xavier weight initialization (xavier_normal_ is torch.nn.init.xavier_normal_)
    def _init_weights(self, module):
        if isinstance(module, nn.Embedding):
            xavier_normal_(module.weight.data)
        elif isinstance(module, nn.Linear):
            xavier_normal_(module.weight.data)
        
    def get_dense_input(self, data):
        dense_input = []
        for col in self.num_cols:
            dense_input.append(data[col])
        return torch.stack(dense_input, dim=-1)

    ### Forward pass
    def forward(self, data):
        dense_fea = self.get_dense_input(data) # [batch,num_nums_cols]
        if self.is_use_cate_cols:
            sparse_fea = self.emb_layer(data) # [batch,num_cate_cols,emb]
            sparse_fea = torch.flatten(sparse_fea,start_dim=1) # [batch,num_cate_cols*emb]
            mlp_input = torch.cat([sparse_fea, dense_fea], dim=-1) # [batch,num_nums_cols+num_cate_cols*emb]
        else:
            mlp_input = dense_fea
            
        y_pred = self.mlp(mlp_input)
        # The loss is computed inside forward so that callers stay agnostic to how it is calculated
        if 'price' in data.keys():
            loss = self.loss_fun(y_pred.squeeze(),data['price'])
            output_dict = {'pred':y_pred,'loss':loss}
        else:
            output_dict = {'pred':y_pred}
        return output_dict
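The K-fold loop below builds the model from a model_config dict that the post never shows. A plausible sketch, matching SaleModel's constructor (the values are assumptions):

# Hypothetical model_config matching SaleModel's constructor
model_config = {
    'is_use_cate_cols': True,
    'vocab_map': vocab_map,           # built during preprocessing
    'embedding_dim': 16,
    'num_cols': config['num_cols'],
    'cate_cols': config['cate_cols'],
    'hidden_units': [256, 128, 64, 32],
    'loss_fun': 'nn.L1Loss()',
}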

5. Defining the pipelines

Training pipeline

# Train the model for one epoch; this is boilerplate worth knowing by heart
def train_model(model, train_loader, optimizer, device, metric_list=['mean_absolute_error']):
    model.train()
    pred_list = []
    label_list = []
    max_iter = len(train_loader)  # number of batches per epoch
    for idx,data in enumerate(train_loader):
        # Move the batch to the target device
        for key in data.keys():
            data[key] = data[key].to(device)
        # Forward pass + loss computation
        output = model(data)
        pred = output['pred']
        loss = output['loss']
        # Standard weight-update boilerplate
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        pred_list.extend(pred.squeeze(-1).cpu().detach().numpy())
        label_list.extend(data['price'].squeeze(-1).cpu().detach().numpy())
        
        if idx%50==0:
            logger.info(f"Iter:{idx}/{max_iter} Loss:{round(loss.item(),4)}")
        
    res_dict = dict()
    for metric in metric_list:
        res_dict[metric] = eval(metric)(label_list,pred_list)

    return res_dict

Validation pipeline


def valid_model(model, valid_loader, device, metric_list=['mean_absolute_error']):
    model.eval()
    pred_list = []
    label_list = []

    for data in (valid_loader):
        # Move the batch to the target device
        for key in data.keys():
            data[key] = data[key].to(device)
        # Forward pass
        output = model(data)
        pred = output['pred']

        pred_list.extend(pred.squeeze(-1).cpu().detach().numpy())
        label_list.extend(data['price'].squeeze(-1).cpu().detach().numpy())
    
    res_dict = dict()
    for metric in metric_list:
        res_dict[metric] = eval(metric)(label_list,pred_list)

    return res_dict

Test (inference) pipeline

def test_model(model, test_loader, device):
    model.eval()
    pred_list = []

    for data in test_loader:
        # Move the batch to the target device
        for key in data.keys():
            data[key] = data[key].to(device)
        # Forward pass
        output = model(data)
        pred = output['pred']
        pred_list.extend(pred.squeeze(-1).cpu().detach().numpy())

    return np.array(pred_list)

6. Training with K-fold cross-validation

test_loader = get_dataloader(test_df, config['cate_cols'] ,config['num_cols'], batch_size=config['batch_size'], num_workers=0, shuffle=False)

n_fold = 5
oof_pre = np.zeros(len(train_df))
y_pre = np.zeros(len(test_df))
device = torch.device(config['device'])

kf = KFold(n_splits=n_fold)
for fold_, (trn_idx, val_idx) in enumerate(kf.split(train_df)):
    logger.info(f"Fold {fold_+1}")
    temp_train_df = train_df.iloc[trn_idx].reset_index(drop=True)
    temp_valid_df = train_df.iloc[val_idx].reset_index(drop=True)
    
    train_loader = get_dataloader(temp_train_df, config['cate_cols'] ,config['num_cols'], batch_size=config['batch_size'], num_workers=4, shuffle=True)
    valid_loader = get_dataloader(temp_valid_df, config['cate_cols'] ,config['num_cols'], batch_size=config['batch_size'], num_workers=0, shuffle=False)
    # Instantiate the model
    model = SaleModel(**model_config)
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], betas=(0.9, 0.999), eps=1e-08, weight_decay=0)
    # Training loop
    for epoch in range(config['epoch']):
        # Train
        logger.info(f"Start Training Epoch:{epoch+1}")
        train_metric = train_model(model, train_loader, optimizer=optimizer, device=device)
        logger.info(f"Train Metric: {train_metric}")
        # Validate
        valid_metric = valid_model(model, valid_loader, device)
        logger.info(f"Valid Metric: {valid_metric}")
    # Save this fold's model weights (the full pipeline also saves the enc_dict)
    save_dict = {'model': model.state_dict()}
    torch.save(save_dict, os.path.join(config['model_ckpt_dir'], f'model_{fold_}.pth'))
    # Out-of-fold (OOF) inference on the validation split
    oof_pre[val_idx] = test_model(model, valid_loader, device=device)
    # Test-set inference, averaged across folds
    y_pre += np.array(test_model(model, test_loader, device=device)) / n_fold
    
# Map the OOF predictions back to the original price scale (inverting the log transform)
oof_pre_ori = np.exp(oof_pre)
price_ori = np.exp(train_df['price'])
mean_absolute_error(price_ori, oof_pre_ori)
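The fold-averaged test predictions y_pre are likewise still on the log scale. A sketch of writing a submission file; the SaleID id column is an assumption about the dataset:

# Map the averaged test predictions back to prices and write a submission
submission = pd.DataFrame({
    'SaleID': test_df['SaleID'],  # hypothetical id column
    'price': np.exp(y_pre),       # invert the log transform
})
submission.to_csv('submission.csv', index=False)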