由于我刚接触Python和机器学习,还有很多知识没有学习,task3的内容就不再进行详解了(涉及到一些很厉害的知识,我还不会hh,正在学习中)
这里放一份dw给的资料(很丰富):特征工程进阶,持续上分:Datawhale
另外本人也是正在学习,我在task2的基础上加上了多折交叉训练
多折交叉训练(K-Fold Cross-Validation,即K折交叉验证)是一种常用的机器学习评估方法,它将数据集分为K个大小大致相等的子集,其中K-1个子集用于训练模型,剩下的1个子集用于验证模型的性能。这个过程重复K次,每次使用不同的子集作为验证集,其余的子集作为训练集。最后,通常会计算所有K次验证结果的平均值或中位数来评估模型的性能。
在Python中,可以使用scikit-learn库中的KFold类来实现多折交叉训练。以下是一个简单的多折交叉训练的例子:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
# Assume X is the feature matrix and y is the label vector.
X = ...  # feature data
y = ...  # label data

# K-fold configuration: K folds, shuffled, with a fixed random_state.
n_splits = 5  # the value of K
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# One fitted model is collected per fold.
models = []

for train_index, test_index in kf.split(X):
    # Slice this fold's training rows and held-out rows.
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    # Create and fit a model for this fold, then keep it.
    model = ...  # create the model instance
    model.fit(X_train, y_train)
    models.append(model)

    # Score the fold on its held-out rows.
    y_pred = model.predict(X_test)
    print(f"Fold {len(models)} accuracy: {accuracy_score(y_test, y_pred)}")

# Any single model in `models` can be used for prediction, or the models can
# be combined by voting, averaging or weighted averaging.
在这个例子中,我们首先定义了K折交叉验证的参数,然后使用KFold类来划分数据集。在每次迭代中,我们都会创建和训练一个模型,并保存它。最后,我们可以使用保存的模型进行预测,或者使用模型融合技术来提高预测的准确性。
同时分数也来到了0.7960,还是没有突破80大关,后续我会加入更多的特征尝试加分
这里也为大家提供一份我的代码(不过只有0.79+):
import pandas as pd
import numpy as np
# Read the training table and the submission template, then stack them with
# the training rows first and a fresh 0..n-1 index.
df_original = pd.read_csv("data/train_data.csv")
df_submit = pd.read_csv("data/sample_submission.csv")
n_original = len(df_original)  # number of training rows at the top of `df`
df = pd.concat([df_original, df_submit], axis=0, ignore_index=True)
def siRNA_feat_builder(s: pd.Series, anti: bool = False) -> pd.DataFrame:
    """Build per-sequence features from a Series of siRNA sequence strings.

    Args:
        s: Series of sequence strings; its index is kept on the result.
        anti: When True the column names contain "anti", otherwise "sense".

    Returns:
        A DataFrame of ``feat_siRNA_{name}_seq_*`` columns: the sequence
        length, an indicator for each of A/U/G/C at the first and at the last
        position, ten start/end pattern indicators, and the G+C fraction.
        The input column itself is not part of the result.
    """
    name = "anti" if anti else "sense"
    prefix = f"feat_siRNA_{name}_seq"
    length = s.str.len()

    # Start from the input column so the result shares its index; that first
    # column is dropped again on return.
    out = s.to_frame()
    out[f"{prefix}_len"] = length

    # Indicator for each base at the first and at the last position.
    for pos, side in ((0, "front"), (-1, "back")):
        terminal = s.str[pos]
        for base in "AUGC":
            out[f"{prefix}_{base}_{side}"] = terminal == base

    # Patterns 1-8 are (required start, required end) pairs, in this order.
    start_end_pairs = (
        ("AA", "UU"),
        ("GA", "UU"),
        ("CA", "UU"),
        ("UA", "UU"),
        ("UU", "AA"),
        ("UU", "GA"),
        ("UU", "CA"),
        ("UU", "UA"),
    )
    for number, (head, tail) in enumerate(start_end_pairs, start=1):
        out[f"{prefix}_pattern_{number}"] = s.str.startswith(head) & s.str.endswith(
            tail
        )

    # Patterns 9 and 10: "A" at the second / second-to-last position.
    out[f"{prefix}_pattern_9"] = s.str[1] == "A"
    out[f"{prefix}_pattern_10"] = s.str[-2] == "A"

    # Count occurrences rather than testing presence: adding two boolean
    # `contains` results caps the numerator at 1, which is not a fraction of
    # G/C bases.
    out[f"{prefix}_pattern_GC_frac"] = (s.str.count("G") + s.str.count("C")) / length
    return out.iloc[:, 1:]
def _one_hot(column: str) -> pd.DataFrame:
    """One-hot encode ``df[column]``, naming columns ``feat_{column}_{value}``."""
    dummies = pd.get_dummies(df[column])
    dummies.columns = [f"feat_{column}_{value}" for value in dummies.columns]
    return dummies


# Categorical metadata columns, one-hot encoded.
df_publication_id = _one_hot("publication_id")
df_gene_target_symbol_name = _one_hot("gene_target_symbol_name")
df_gene_target_ncbi_id = _one_hot("gene_target_ncbi_id")
df_gene_target_species = _one_hot("gene_target_species")

# Integer parsed from the duplex id (characters 3..-2 with surrounding dots
# stripped), then min-max scaled.
siRNA_duplex_id_values = df.siRNA_duplex_id.str[3:-2].str.strip(".").astype("int")
_id_min = siRNA_duplex_id_values.min()
_id_max = siRNA_duplex_id_values.max()
siRNA_duplex_id_values = (siRNA_duplex_id_values - _id_min) / (_id_max - _id_min)
df_siRNA_duplex_id = pd.DataFrame(siRNA_duplex_id_values)

# Cell line: one-hot columns plus two substring flags (missing values -> 0).
df_cell_line_donor = _one_hot("cell_line_donor")
for _flag, _needle in (("hepatocytes", "Hepatocytes"), ("cells", "Cells")):
    df_cell_line_donor[f"feat_cell_line_donor_{_flag}"] = (
        df.cell_line_donor.str.contains(_needle).fillna(False).astype("int")
    )

# Concentration is passed through unchanged as a single-column frame.
df_siRNA_concentration = df[["siRNA_concentration"]]

df_Transfection_method = _one_hot("Transfection_method")
df_Duration_after_transfection_h = _one_hot("Duration_after_transfection_h")

# Column-wise concatenation of every feature group; the last column of `df`
# is appended at the very end so it is also the last column of `feats`.
_feature_groups = [
    df_publication_id,
    df_gene_target_symbol_name,
    df_gene_target_ncbi_id,
    df_gene_target_species,
    df_siRNA_duplex_id,
    df_cell_line_donor,
    df_siRNA_concentration,
    df_Transfection_method,
    df_Duration_after_transfection_h,
    siRNA_feat_builder(df.siRNA_sense_seq, False),
    siRNA_feat_builder(df.siRNA_antisense_seq, True),
    df.iloc[:, [-1]],
]
feats = pd.concat(_feature_groups, axis=1)
!pip install lightgbm
import lightgbm as lgb
from sklearn.model_selection import train_test_split
# Hold out 20% of the first n_original rows; the last column of `feats` is
# used as y and every other column as X.
_labelled = feats.iloc[:n_original]
X_train, X_test, y_train, y_test = train_test_split(
    _labelled.iloc[:, :-1],
    _labelled.iloc[:, -1],
    test_size=0.2,
    random_state=42,
)
!pip install scikit-learn
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
# calculate_metrics computes the custom evaluation metric.
def calculate_metrics(preds, data, threshold=30):
    """Score predictions with a blend of overall MAE and in-range MAE * F1.

    Args:
        preds: Array of predicted values.
        data: Object whose ``get_label()`` returns the true values.
        threshold: Upper bound of the ``[0, threshold]`` range used for the
            binary in-range labels and for the in-range MAE.

    Returns:
        ``("custom_score", score, True)``; the final ``True`` marks the score
        as higher-is-better.
    """
    y_true = data.get_label()
    y_pred = preds

    overall_mae = np.mean(np.abs(y_true - y_pred))

    # Binary "falls inside [0, threshold]" labels for truth and prediction.
    true_in_range = ((y_true >= 0) & (y_true <= threshold)).astype(int)
    pred_mask = (y_pred >= 0) & (y_pred <= threshold)
    pred_in_range = pred_mask.astype(int)

    # MAE restricted to rows predicted inside the range; 100 if there are none.
    if np.sum(pred_mask) > 0:
        range_mae = mean_absolute_error(y_true[pred_mask], y_pred[pred_mask])
    else:
        range_mae = 100

    # Precision / recall of the in-range labels, 0 when the denominator is 0.
    hits = (np.array(pred_in_range) & true_in_range).sum()
    n_pred = np.sum(pred_in_range)
    n_true = np.sum(true_in_range)
    precision = hits / n_pred if n_pred > 0 else 0
    recall = hits / n_true if n_true > 0 else 0

    pr_sum = precision + recall
    f1 = 0 if pr_sum == 0 else 2 * precision * recall / pr_sum

    score = (1 - overall_mae / 100) * 0.5 + (1 - range_mae / 100) * f1 * 0.5
    return "custom_score", score, True
# adaptive_learning_rate builds a callback that decays the learning rate.
def adaptive_learning_rate(decay_rate=0.8, patience=50):
    """Create a training callback that decays the learning rate on stalls.

    The callback reads the score from the last entry of
    ``env.evaluation_result_list`` and treats higher as better. Each call
    whose score does not beat the best score seen so far increments a
    counter; an improvement does not reset it. Once the counter reaches
    ``patience`` the learning rate is multiplied by ``decay_rate`` and the
    counter restarts from zero.

    Args:
        decay_rate: Multiplier applied to the current learning rate.
        patience: Number of non-improving calls between two decays.

    Returns:
        A callable taking the callback environment ``env``; it uses
        ``env.evaluation_result_list`` and ``env.model``.
    """
    best_score = float("-inf")  # scores are higher-is-better
    wait = 0

    def callback(env):
        nonlocal best_score, wait
        # Uses the last evaluation entry; its third field is the score.
        current_score = env.evaluation_result_list[-1][2]
        if current_score > best_score:
            best_score = current_score
        else:
            wait += 1

        if wait < patience:
            return

        wait = 0
        new_lr = float(env.model.params.get('learning_rate')) * decay_rate
        # Editing the Python-side params dict alone is not passed on to the
        # booster being trained; reset_parameter applies the new value to it.
        env.model.reset_parameter({'learning_rate': new_lr})
        env.model.params['learning_rate'] = new_lr
        print(f"Learning rate adjusted to {env.model.params.get('learning_rate')}")

    return callback
# Per-row weights: 2 where the target lies in [0, 30] (both ends included),
# 1 everywhere else.
weight_ls = np.where(feats['mRNA_remaining_pct'].between(0, 30), 2, 1)
# train fits one model per cross-validation fold.
def train(feats, n_original, weight_ls):
    """Train one LightGBM booster per fold of a shuffled 10-fold split.

    Args:
        feats: DataFrame whose last column is used as the label and whose
            remaining columns are used as features.
        n_original: Only the first ``n_original`` rows are split into folds.
        weight_ls: Array of per-row weights indexed by row position; applied
            to the training rows of each fold only.

    Returns:
        List holding the trained booster of every fold, in fold order.
    """
    boost_round = 25000
    early_stop_rounds = int(boost_round * 0.1)

    # LightGBM parameters, identical for every fold.
    params = {
        "boosting_type": "gbdt",
        "objective": "regression",
        "metric": "None",  # only the custom feval is reported
        "max_depth": 8,
        "num_leaves": 63,
        "min_data_in_leaf": 2,
        "learning_rate": 0.05,
        "feature_fraction": 0.9,
        "lambda_l1": 0.1,
        "lambda_l2": 0.2,
        "verbose": -1,  # -1 suppresses output
        "early_stopping_round": early_stop_rounds,
        "num_threads": 8,
    }

    splitter = KFold(n_splits=10, shuffle=True, random_state=42)

    gbms = []
    for fit_rows, held_rows in splitter.split(feats.iloc[:n_original, :]):
        # Features are every column but the last; the last column is the label.
        train_data = lgb.Dataset(
            feats.iloc[fit_rows, :-1],
            label=feats.iloc[fit_rows, -1],
            weight=weight_ls[fit_rows],
        )
        val_data = lgb.Dataset(
            feats.iloc[held_rows, :-1],
            label=feats.iloc[held_rows, -1],
            reference=train_data,
        )

        # Built inside the loop so each fold gets new callback objects:
        # learning-rate decay, periodic metric logging, early stopping.
        fold_callbacks = [
            adaptive_learning_rate(decay_rate=0.9, patience=1000),
            lgb.log_evaluation(period=200, show_stdv=True),
            lgb.early_stopping(
                stopping_rounds=early_stop_rounds,
                first_metric_only=True,
                verbose=True,
                min_delta=0.00001,
            ),
        ]

        gbm = lgb.train(
            params,
            train_data,
            num_boost_round=boost_round,
            valid_sets=[val_data],
            feval=calculate_metrics,  # custom metric used for evaluation
            callbacks=fold_callbacks,
        )
        valid_score = gbm.best_score["valid_0"]["custom_score"]
        print(f"best_valid_score: {valid_score}")
        gbms.append(gbm)

    return gbms
# Inputs used below:
#   feats      - DataFrame holding the features and the label
#   n_original - number of training rows
#   weight_ls  - per-row training weights
#   df_submit  - submission DataFrame; it should contain everything needed
#                except the prediction column
trained_models = train(feats, n_original, weight_ls)

# Predict with the last fold's model only (choosing the best model or
# blending several models are alternatives).
gbm = trained_models[-1]
_submit_features = feats.iloc[n_original:, :-1]
y_pred = gbm.predict(_submit_features)

# Store the predictions under the column name assumed to be required for
# submission, then write the file.
df_submit["mRNA_remaining_pct"] = y_pred
df_submit.to_csv("submit.csv", index=False)