Datawhale AI 夏令营——基于AI预测siRNA药物药效 Task3

在 LightGBM(gbm)模型的基础上,通过添加序列比对分数特征,并使用贝叶斯优化调整模型参数来提高分数。

1. 载入库

from Bio import pairwise2
import optuna
import lightgbm as lgb
from lightgbm import LGBMRegressor
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error  # Ensure this import is included

2. 将序列列转换为字符串格式


# Cast the siRNA and target-gene sequence columns to str so they can be
# passed to the pairwise aligner as plain strings.
for seq_col in ('siRNA_antisense_seq', 'siRNA_sense_seq', 'gene_target_seq'):
    df[seq_col] = df[seq_col].astype(str)

3. 定义比对分数的函数


# Alignment-score feature
def calculate_alignment_score(row, seq_type='sense'):
    """Return the global alignment score between one siRNA strand and the target.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing
            'siRNA_sense_seq', 'siRNA_antisense_seq' and 'gene_target_seq'.
        seq_type: 'sense' selects the sense strand; any other value selects
            the antisense strand.

    Returns:
        The score returned by ``pairwise2.align.globalxx`` with
        ``score_only=True``.
    """
    strand_column = 'siRNA_sense_seq' if seq_type == 'sense' else 'siRNA_antisense_seq'
    return pairwise2.align.globalxx(
        row[strand_column],
        row['gene_target_seq'],
        score_only=True,
    )


4. 运行得到比对分数并将其与原特征列表合并


# Compute one alignment score per row of df for each strand.
sense_score, antisense_score = (
    df.apply(calculate_alignment_score, axis=1, seq_type=strand)
    for strand in ('sense', 'antisense')
)

# Add the scores to the feature table as new columns.
feats['sense_score'] = sense_score
feats['antisense_score'] = antisense_score

# Pop and re-append the target so it is again the last column of feats,
# after the newly added score columns.
mRNA_remaining_pct = feats.pop('mRNA_remaining_pct')
feats['mRNA_remaining_pct'] = mRNA_remaining_pct

5. 使用 Optuna 对 LightGBM 模型参数进行贝叶斯优化

def objective(trial):
    """Optuna objective: fit a LightGBM regressor and return its RMSE.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Args:
        trial: Optuna trial used to sample ``max_depth``, ``learning_rate``,
            ``n_estimators``, ``min_child_samples`` and ``boosting_type``.

    Returns:
        RMSE of the fitted model's predictions on ``X_test`` against
        ``y_test`` (lower is better).
    """
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        # suggest_float(..., log=True) replaces the deprecated
        # suggest_loguniform; the parameter name and range are unchanged, so
        # study.best_params keeps the same keys.
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 100),
        'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'dart']),
    }

    # Fit on the training split with the sampled hyperparameters.
    model = LGBMRegressor(**params)
    model.fit(X_train, y_train)

    # Score on the held-out split; this value is what the study minimizes.
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    return rmse

# Search the hyperparameter space with Optuna: 50 trials, minimizing the
# RMSE returned by objective().
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

# Report the best hyperparameters found and the RMSE they achieved.
for label, value in (
    ("Best parameters:", study.best_params),
    ("Best RMSE:", study.best_value),
):
    print(label, value)

6. 使用最佳参数训练最终模型

# Refit a LightGBM regressor on the training split using the best
# hyperparameters found by the study.
best_params = study.best_params
final_model = LGBMRegressor(**best_params).fit(X_train, y_train)

7. 使用最终模型对测试数据进行预测

# Rows from position n_original onward, with the last column excluded
# (presumably the mRNA_remaining_pct target -- verify the column order of feats).
submit_features = feats.iloc[n_original:, :-1]
y_pred = final_model.predict(submit_features)

# Store the predictions in the submission frame and write it to CSV
# without the index column.
df_submit["mRNA_remaining_pct"] = y_pred
df_submit.to_csv("submission.csv", index=False)

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值