在 LightGBM 模型的基础上，通过添加序列比对分数特征并优化模型超参数来提高分数。
1. 载入库
from Bio import pairwise2
import optuna
import lightgbm as lgb
from lightgbm import LGBMRegressor
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error # Ensure this import is included
2. 将序列列转换为字符串类型
# Cast the three sequence columns to str so they can be passed to the aligner.
# NOTE(review): astype(str) presumably renders missing values as the text
# 'nan', which would then be aligned like a real sequence -- confirm that
# none of these columns contains missing entries.
for seq_col in ('siRNA_antisense_seq', 'siRNA_sense_seq', 'gene_target_seq'):
    df[seq_col] = df[seq_col].astype(str)
3. 定义比对分数的函数
# Alignment-score feature for a single row.
def calculate_alignment_score(row, seq_type='sense'):
    """Return the global alignment score of one siRNA strand against the target.

    Args:
        row: Mapping-like record providing 'siRNA_sense_seq',
            'siRNA_antisense_seq' and 'gene_target_seq'.
        seq_type: 'sense' selects the sense strand; any other value selects
            the antisense strand.

    Returns:
        The score reported by ``pairwise2.align.globalxx`` with
        ``score_only=True``.
    """
    if seq_type == 'sense':
        strand_column = 'siRNA_sense_seq'
    else:
        strand_column = 'siRNA_antisense_seq'

    # score_only=True asks for the score alone instead of full alignments.
    return pairwise2.align.globalxx(
        row[strand_column],
        row['gene_target_seq'],
        score_only=True,
    )
4. 运行得到比对分数并将其与原特征列表合并
# Score every row of df once per strand (sense first, then antisense).
sense_score, antisense_score = (
    df.apply(calculate_alignment_score, axis=1, seq_type=strand)
    for strand in ('sense', 'antisense')
)

# Add the two scores to the feature table as new columns.
# NOTE(review): assumes df and feats share the same index -- confirm.
feats['sense_score'] = sense_score
feats['antisense_score'] = antisense_score

# Pop and re-insert the target so it is the last column again.
mRNA_remaining_pct = feats.pop('mRNA_remaining_pct')
feats['mRNA_remaining_pct'] = mRNA_remaining_pct
5. 使用 Optuna 进行贝叶斯优化，搜索 LightGBM 模型的超参数
def objective(trial):
    """Optuna objective: fit a LightGBM regressor and return its hold-out RMSE.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``,
    which are defined outside this section.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Root-mean-squared error of the fitted model's predictions for
        ``X_test`` against ``y_test``.
    """
    # Hyperparameter search space; 'objective' and 'metric' are fixed.
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        # suggest_float(..., log=True) replaces the deprecated
        # suggest_loguniform and samples the same log-scaled range.
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 100),
        'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'dart']),
    }

    # Create and fit the model for this trial.
    model = LGBMRegressor(**params)
    model.fit(X_train, y_train)

    # NOTE(review): X_test/y_test act as the tuning set here; if they are
    # also used to report the final score, that score will be optimistic.
    y_pred = model.predict(X_test)

    # RMSE is the value the study minimizes.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    return rmse
# Run the hyperparameter search: 50 trials, minimizing the objective's return value.
study = optuna.create_study(direction="minimize")
study.optimize(func=objective, n_trials=50)

# Report the best configuration found and the score it achieved.
print("Best parameters:", study.best_params)
print("Best RMSE:", study.best_value)
6. 使用最佳参数训练最终模型
# Refit a regressor on the training split using the best sampled hyperparameters.
# Only the sampled values are passed; the fixed 'objective'/'metric' entries
# used during the search are not part of study.best_params.
best_params = study.best_params
final_model = LGBMRegressor(**best_params).fit(X_train, y_train)
7. 用最终模型进行预测，并写出提交文件 submission.csv
# Rows from position n_original onward are the ones to predict.
# NOTE(review): presumably these are the test rows appended after the
# original training rows -- confirm where n_original is defined.
rows_to_predict = feats.iloc[n_original:]

# Drop the last column (the mRNA_remaining_pct target) before predicting.
y_pred = final_model.predict(rows_to_predict.iloc[:, :-1])

# Store the predictions and write the submission file without the index.
df_submit["mRNA_remaining_pct"] = y_pred
df_submit.to_csv("submission.csv", index=False)