Datawhale AI Summer Camp: siRNA Drug Efficacy Prediction (Task 02)

1. Complete Code

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb

# Load the data and merge the training set with the submission template
df_original = pd.read_csv("train_data.csv")
n_original = df_original.shape[0]
df_submit = pd.read_csv("sample_submission.csv")
df = pd.concat([df_original, df_submit], axis=0).reset_index(drop=True)

# Feature construction function for the siRNA sequences
def siRNA_feat_builder(s: pd.Series, anti: bool = False):
    name = "anti" if anti else "sense"
    df = s.to_frame()
    df[f"feat_siRNA_{name}_seq_len"] = s.str.len()
    
    nucleotides = "AUGC"
    for pos in [0, -1]:
        for c in nucleotides:
            df[f"feat_siRNA_{name}_seq_{c}_{'front' if pos == 0 else 'back'}"] = (s.str[pos] == c)
    
    patterns = [
        ("AA", "UU"), ("GA", "UU"), ("CA", "UU"), ("UA", "UU"),
        ("UU", "AA"), ("UU", "GA"), ("UU", "CA"), ("UU", "UA")
    ]
    
    for i, (start, end) in enumerate(patterns, 1):
        df[f"feat_siRNA_{name}_seq_pattern_{i}"] = s.str.startswith(start) & s.str.endswith(end)
        
    df[f"feat_siRNA_{name}_seq_pattern_9"] = s.str[1] == "A"
    df[f"feat_siRNA_{name}_seq_pattern_10"] = s.str[-2] == "A"
    df[f"feat_siRNA_{name}_seq_pattern_GC_frac"] = (s.str.count("G") + s.str.count("C")) / s.str.len()
    
    return df.iloc[:, 1:]

# One-hot encoding helper
def get_dummies_with_prefix(df, column, prefix):
    dummies = pd.get_dummies(df[column], prefix=f"feat_{prefix}")
    return dummies

# Feature processing
df_publication_id = get_dummies_with_prefix(df, 'publication_id', 'publication_id')
df_gene_target_symbol_name = get_dummies_with_prefix(df, 'gene_target_symbol_name', 'gene_target_symbol_name')
df_gene_target_ncbi_id = get_dummies_with_prefix(df, 'gene_target_ncbi_id', 'gene_target_ncbi_id')
df_gene_target_species = get_dummies_with_prefix(df, 'gene_target_species', 'gene_target_species')
df_cell_line_donor = get_dummies_with_prefix(df, 'cell_line_donor', 'cell_line_donor')
df_Transfection_method = get_dummies_with_prefix(df, 'Transfection_method', 'Transfection_method')
df_Duration_after_transfection_h = get_dummies_with_prefix(df, 'Duration_after_transfection_h', 'Duration_after_transfection_h')

# Extract the numeric part of siRNA_duplex_id and min-max normalize it
siRNA_duplex_id_values = df.siRNA_duplex_id.str[3:-2].str.strip(".").astype("int")
siRNA_duplex_id_values = (siRNA_duplex_id_values - siRNA_duplex_id_values.min()) / (
    siRNA_duplex_id_values.max() - siRNA_duplex_id_values.min()
)
df_siRNA_duplex_id = siRNA_duplex_id_values.to_frame(name='feat_siRNA_duplex_id_normalized')

df_siRNA_concentration = df.siRNA_concentration.to_frame(name='feat_siRNA_concentration')

# Combine all features
feats = pd.concat(
    [
        df_publication_id,
        df_gene_target_symbol_name,
        df_gene_target_ncbi_id,
        df_gene_target_species,
        df_siRNA_duplex_id,
        df_cell_line_donor,
        df_siRNA_concentration,
        df_Transfection_method,
        df_Duration_after_transfection_h,
        siRNA_feat_builder(df.siRNA_sense_seq, False),
        siRNA_feat_builder(df.siRNA_antisense_seq, True),
        df['mRNA_remaining_pct'].to_frame(name='mRNA_remaining_pct'),
    ],
    axis=1,
)

# Train/test split and standardization
X = feats.iloc[:n_original, :-1]
y = feats.iloc[:n_original, -1]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Hyperparameter tuning with grid search
gbm = lgb.LGBMRegressor(boosting_type='gbdt', objective='regression', metric='rmse', random_state=42)
param_grid = {
    'max_depth': [5, 7, 9],
    'learning_rate': [0.01, 0.05, 0.1],
    'num_leaves': [31, 50, 70],
    'n_estimators': [1000, 5000, 10000]
}

grid = GridSearchCV(estimator=gbm, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1)
grid.fit(X_train, y_train)

best_params = grid.best_params_
print(f"Best parameters found: {best_params}")

# Retrain the model with the best parameters using the native LightGBM API.
# n_estimators is passed separately as num_boost_round, so drop it from the
# params dict to avoid the conflicting num_iterations alias.
train_params = {k: v for k, v in best_params.items() if k != 'n_estimators'}
best_gbm = lgb.train(
    {**train_params, "objective": "regression", "metric": "rmse"},
    lgb.Dataset(X_train, label=y_train),
    num_boost_round=best_params['n_estimators'],
    valid_sets=[lgb.Dataset(X_test, label=y_test)],
    callbacks=[
        lgb.early_stopping(stopping_rounds=100),
        lgb.log_evaluation(period=100),
    ],
)

# Predict and save the results
X_submit = scaler.transform(feats.iloc[n_original:, :-1])
y_pred = best_gbm.predict(X_submit)

df_submit["mRNA_remaining_pct"] = y_pred
df_submit.to_csv("submission.csv", index=False)

2. Code Analysis

2.1 One-Hot Encoding of Categorical Features

def get_dummies_with_prefix(df, column, prefix):
    dummies = pd.get_dummies(df[column], prefix=f"feat_{prefix}")
    return dummies

This helper one-hot encodes the specified column and prefixes every generated indicator column with feat_. The purpose is to turn categorical features into a numeric form the model can consume; a toy illustration follows below.
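A minimal sketch of what the helper produces. The column values here are invented for illustration only and are not taken from the competition data:

import pandas as pd

# Hypothetical toy input: a single categorical column
toy = pd.DataFrame({"Transfection_method": ["Lipofection", "Electroporation", "Lipofection"]})

# Same call pattern as get_dummies_with_prefix(toy, "Transfection_method", "Transfection_method")
dummies = pd.get_dummies(toy["Transfection_method"], prefix="feat_Transfection_method")
print(dummies)
# Produces two indicator columns:
#   feat_Transfection_method_Electroporation, feat_Transfection_method_Lipofection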

2.2 Hyperparameter Tuning with Grid Search

gbm = lgb.LGBMRegressor(boosting_type='gbdt', objective='regression', metric='rmse', random_state=42)
param_grid = {
    'max_depth': [5, 7, 9],
    'learning_rate': [0.01, 0.05, 0.1],
    'num_leaves': [31, 50, 70],
    'n_estimators': [1000, 5000, 10000]
}

grid = GridSearchCV(estimator=gbm, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1)
grid.fit(X_train, y_train)

best_params = grid.best_params_
print(f"Best parameters found: {best_params}")

GridSearchCV exhaustively searches the parameter grid for the best hyperparameter combination. The grid covers max_depth, learning_rate, num_leaves and n_estimators; every candidate is evaluated with 5-fold cross-validation (scored by negative mean squared error), and verbose=1 logs the progress of the fits.
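Beyond the single best combination, the fitted search object also stores the per-candidate cross-validation scores. A small sketch, assuming grid has been fitted as above, that ranks the tried settings by cross-validated RMSE (converting back from the negative MSE returned by the scorer):

import pandas as pd

cv_results = pd.DataFrame(grid.cv_results_)
# scoring='neg_mean_squared_error' stores negated MSE, so flip the sign and take the square root
cv_results["mean_cv_rmse"] = (-cv_results["mean_test_score"]) ** 0.5
top5 = cv_results.sort_values("mean_cv_rmse")[["params", "mean_cv_rmse", "std_test_score"]].head(5)
print(top5.to_string(index=False))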
