DW Summer Camp Session 1 - Zero-to-Hero Introduction to AI (Machine Learning) Competitions

#ThisSummerCampIsNoJoke #AISummerCamp #Datawhale #SummerCamp

If you'd like to take part, feel free to contact me anytime. AI competition discussion group: 983387298

1. Baseline walkthrough

# 1. Import the required libraries
# Import pandas for data processing and analysis
import pandas as pd
# Import numpy for scientific computing and multi-dimensional array operations
import numpy as np
# Import the LGBMClassifier class from the lightgbm module
from lightgbm import LGBMClassifier

# 2. Read the training and test sets
# Use read_excel() to load the training data from 'traindata-new.xlsx'
train = pd.read_excel('./data/data280993/traindata-new.xlsx')
# Use read_excel() to load the test data from 'testdata-new.xlsx'
test = pd.read_excel('./data/data280993/testdata-new.xlsx')

# 3. Feature engineering
# 3.1 The test data does not contain DC50 (nM) or Dmax (%), so drop them from the training data
train = train.drop(['DC50 (nM)', 'Dmax (%)'], axis=1)

# 3.2 Encode object-typed columns as missing-value indicators
# (note: this is a crude null-indicator encoding, not target encoding)
for col in train.columns[2:]:
    if train[col].dtype == object or test[col].dtype == object:
        train[col] = train[col].isnull()
        test[col] = test[col].isnull()

# 4. Train a LightGBM (gradient-boosted tree) classifier
model = LGBMClassifier(verbosity=-1)
model.fit(train.iloc[:, 2:].values, train['Label'])
# test has no Label column, so its feature columns start at index 1
pred = model.predict(test.iloc[:, 1:].values)

# 5. Save the result file locally
pd.DataFrame(
    {'uuid': test['uuid'], 'Label': pred}
).to_csv('submit.csv', index=None)

2. Score-boosting tips

Understanding the data fields
The competition description does not explain the data fields, so each one is explained below (see the inspection sketch after the list):
- UUID: a unique identifier for each data record.
- Label: the prediction target; the code below uses it as the binary classification label.
- Uniprot: a protein database providing information on protein sequences, structures, and functions.
- Target: the target protein, i.e., the specific protein the PROTAC is designed to degrade.
- E3 ligase: an enzyme involved in ubiquitination that helps tag proteins for degradation.
- PDB: the Protein Data Bank, a database of protein and nucleic-acid structures.
- Name: the name of the compound.
- Smiles: a string format for representing molecular structure.
- DC50 (nM): half-maximal degradation concentration, the compound concentration at which half of the maximal degradation effect is reached.
- Dmax (%): maximal degradation efficiency, the largest percentage of degradation the compound can achieve.
- Assay: the experimental method, presumably the assay used to measure DC50, Dmax, and so on.
- Percent degradation: the percentage of the molecule degraded.
- IC50 (nM, Protac to Target): half-maximal inhibitory concentration of the PROTAC against its target protein.
- EC50 (nM, Protac to Target): half-maximal effective concentration, the concentration producing half of the maximal biological effect.
- Kd (nM, Protac to Target): dissociation constant, describing the binding strength of the PROTAC-target complex.
- Ki (nM, Protac to Target): inhibition constant, describing the binding strength of an inhibitor to an enzyme.
- delta G, delta H, -T*delta S: the Gibbs free-energy change, enthalpy change, and entropy term of the binding reaction, related by ΔG = ΔH - TΔS; these thermodynamic parameters describe the energetics of the interaction.
- kon, koff: the association and dissociation rate constants.
- t1/2 (s, Protac to Target): half-life, the time for the concentration to fall to half its initial value.
- Assay (Protac to Target, kon/koff/t1/2): presumably the assay used to measure the rate constants and the half-life.
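Before any feature engineering, it is worth checking how sparse these fields actually are, since many of the assay columns are later dropped for having too few values. A minimal inspection sketch, assuming the training file has been loaded as in the baseline:

import pandas as pd

train = pd.read_excel('traindata-new.xlsx')
# One row per column: dtype, non-null count, and percent missing
summary = pd.DataFrame({
    'dtype': train.dtypes.astype(str),
    'non_null': train.notnull().sum(),
    'pct_missing': (train.isnull().mean() * 100).round(1),
})
print(summary.sort_values('non_null'))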

#1. CatBoost:

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
from sklearn.metrics import f1_score
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.feature_extraction.text import TfidfVectorizer
import tqdm, sys, os, gc, re, argparse, warnings
train = pd.read_excel('traindata-new.xlsx')
test = pd.read_excel('testdata-new.xlsx')
# The test data does not contain DC50 (nM) or Dmax (%)
train = train.drop(['DC50 (nM)', 'Dmax (%)'], axis=1)
# drop_cols collects the names of columns with fewer than 10 non-null values in the test set
drop_cols = []
for f in test.columns:
    if test[f].notnull().sum() < 10:
        drop_cols.append(f)
# Drop these columns from both sets so later analysis and modeling do not rely on features dominated by missing values
train = train.drop(drop_cols, axis=1)
test = test.drop(drop_cols, axis=1)
# Concatenate the cleaned train and test sets into a single DataFrame, data, for unified feature engineering
data = pd.concat([train, test], axis=0, ignore_index=True)
cols = data.columns[2:]
# Canonicalize each SMILES string by round-tripping it through an RDKit molecule object
data['smiles_list'] = data['Smiles'].apply(
    lambda s: Chem.MolToSmiles(Chem.MolFromSmiles(s), isomericSmiles=True))
# Compute TF-IDF features with TfidfVectorizer
tfidf = TfidfVectorizer(max_df = 0.9, min_df = 1, sublinear_tf = True)
res = tfidf.fit_transform(data['smiles_list'])
# Convert the sparse result to a DataFrame
tfidf_df = pd.DataFrame(res.toarray())
tfidf_df.columns = [f'smiles_tfidf_{i}' for i in range(tfidf_df.shape[1])]
# Concatenate the TF-IDF columns onto data
data = pd.concat([data, tfidf_df], axis=1)
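One caveat: TfidfVectorizer's default token_pattern keeps only runs of two or more word characters, so most SMILES punctuation (ring closures, brackets, bond symbols) never reaches the model. A character n-gram analyzer is a common alternative for SMILES text; a minimal sketch, with illustrative (untuned) parameters:

# Hypothetical variant: character n-grams retain SMILES punctuation
tfidf_char = TfidfVectorizer(analyzer='char', ngram_range=(2, 4),
                             max_df=0.9, min_df=1, sublinear_tf=True)
res_char = tfidf_char.fit_transform(data['smiles_list'])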

# Ordinal (natural-number) encoding
def label_encode(series):
    unique = list(series.unique())
    return series.map(dict(zip(
        unique, range(series.nunique())
    )))
for col in cols:
    if data[col].dtype == 'object':
        data[col]  = label_encode(data[col])
train = data[data.Label.notnull()].reset_index(drop=True)
test = data[data.Label.isnull()].reset_index(drop=True)

# Feature selection
features = [f for f in train.columns if f not in ['uuid','Label','smiles_list']]

# Build the training and test sets
x_train = train[features]
x_test = test[features]
# Training labels
y_train = train['Label'].astype(int)

def cv_model(clf, train_x, train_y, test_x, clf_name, seed=2022):
    kf = KFold(n_splits=5, shuffle=True, random_state=seed)
    train = np.zeros(train_x.shape[0])
    test = np.zeros(test_x.shape[0])
    cv_scores = []
    # 5-fold CV: each fold trains on 4 of the 5 partitions and validates on the held-out one
    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} {}************************************'.format(str(i+1), str(seed)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]
        params = {'learning_rate': 1, 'depth': 6, 'l2_leaf_reg': 11, 'bootstrap_type':'Bernoulli','random_seed':seed,
                  'od_type': 'Iter', 'od_wait': 100, 'allow_writing_files': False, 'task_type':'CPU'}
        model = clf(iterations=20000, **params, eval_metric='AUC')
        model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                  metric_period=100,
                  cat_features=[],
                  use_best_model=True,
                  verbose=1)
        val_pred  = model.predict_proba(val_x)[:,1]
        test_pred = model.predict_proba(test_x)[:,1]
        train[valid_index] = val_pred
        test += test_pred / kf.n_splits
        cv_scores.append(f1_score(val_y, np.where(val_pred>0.6, 1, 0)))
        print("f1_score是:",end='')
        print(cv_scores)
    print("%s_score_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return train, test
cat_train, cat_test = cv_model(CatBoostClassifier, x_train, y_train, x_test, "cat")
pd.DataFrame(
    {
        'uuid': test['uuid'],
        'Label': np.where(cat_test>0.5, 1, 0)
    }
).to_csv('submit.csv', index=None)
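Note the mismatch above: the CV scores binarize at 0.6 while the submission binarizes at 0.5. Since cat_train holds out-of-fold probabilities, the F1-optimal threshold can be searched directly; a minimal sketch, assuming cat_train and y_train from the code above:

import numpy as np
from sklearn.metrics import f1_score

# Scan candidate thresholds against the out-of-fold predictions
thresholds = np.arange(0.1, 0.9, 0.01)
scores = [f1_score(y_train, np.where(cat_train > t, 1, 0)) for t in thresholds]
best_t = thresholds[int(np.argmax(scores))]
print('best threshold:', best_t, 'oof f1:', max(scores))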

Parameter tuning (a sweep sketch follows below):
'learning_rate': 1
'depth': 6
'l2_leaf_reg': 11
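A minimal grid-sweep sketch over these three parameters (the candidate values are illustrative, not a recommendation; a quick 500-iteration CatBoost stands in for the full 20000-iteration CV above):

import itertools
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score

# Hypothetical sweep: score each combination by mean 5-fold F1
grid = {'learning_rate': [0.1, 0.3, 1.0], 'depth': [4, 6, 8], 'l2_leaf_reg': [3, 11, 30]}
best = (None, -1.0)
for lr, depth, l2 in itertools.product(*grid.values()):
    clf = CatBoostClassifier(iterations=500, learning_rate=lr, depth=depth,
                             l2_leaf_reg=l2, allow_writing_files=False, verbose=0)
    score = cross_val_score(clf, x_train, y_train, cv=5, scoring='f1').mean()
    if score > best[1]:
        best = ((lr, depth, l2), score)
print('best (lr, depth, l2):', best[0], 'mean f1:', best[1])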

Taking a strong competitor's solution as a reference, let's improve on it!

First, run the base code:
import pandas as pd
import numpy as np
import lightgbm as lgb
from lightgbm import LGBMClassifier
import xgboost as xgb
from xgboost import XGBClassifier
import catboost as cat
from catboost import CatBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
from sklearn.metrics import f1_score
from rdkit import Chem
from rdkit.Chem import Descriptors
import tqdm, sys, os, gc, re, argparse, warnings
warnings.filterwarnings('ignore')

# 2. Read the training and test sets
train_data_file = "./dataset-new/traindata-new.xlsx"
test_data_file = "./dataset-new/testdata-new.xlsx"

train = pd.read_excel(train_data_file, sheet_name='Sheet1')
test = pd.read_excel(test_data_file, sheet_name='Sheet1')

# The test data does not contain DC50 (nM) or Dmax (%), so drop them from the training data
train.drop(['DC50 (nM)', 'Dmax (%)'], axis=1, inplace=True)

# drop_cols collects the names of columns with fewer than 10 non-null values in the test set
drop_cols = []
for f in test.columns:
    if test[f].notnull().sum() < 10:
        drop_cols.append(f)

# Concatenate the cleaned train and test sets into a single DataFrame, data, for unified feature engineering
data = pd.concat([train, test], axis=0, ignore_index=True)

# Keep copies of the Smiles and InChI columns before they are dropped
data_smiles = data['Smiles']
data_inchis = data['InChI']
# Also extend drop_cols with a few features that are clearly not usable as-is
drop_cols.extend(['Name', 'PDB', 'Smiles', 'Article DOI', 'Molecular Formula', 'InChI', 'InChI Key'])
# Drop all of these columns so later modeling does not rely on features dominated by missing values
data = data.drop(drop_cols, axis=1)

# Canonicalize each SMILES string by round-tripping it through an RDKit molecule object
data_smiles_transversed = data_smiles.apply(
    lambda s: Chem.MolToSmiles(Chem.MolFromSmiles(s), isomericSmiles=True))

# Compute TF-IDF features with TfidfVectorizer
tfidf = TfidfVectorizer(max_df = 0.9, min_df = 1, sublinear_tf = True)
res = tfidf.fit_transform(data_smiles_transversed)

# Convert the sparse result to a DataFrame
tfidf_df = pd.DataFrame(res.toarray())
tfidf_df.columns = [f'smiles_tfidf_{i}' for i in range(tfidf_df.shape[1])]

# Concatenate the TF-IDF columns onto data
data = pd.concat([data, tfidf_df], axis=1)

# Ordinal (natural-number) encoding
def label_encode(series):
    unique = list(series.unique())
    return series.map(dict(zip(
        unique, range(series.nunique())
    )))

cols = data.columns[2:]
for col in cols:
    if data[col].dtype == 'object':
        data[col] = label_encode(data[col])
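The imports above include rdkit.Chem.Descriptors, but the base code never uses it. Physicochemical descriptors are a natural extra feature source; a minimal sketch building on data_smiles from above (the chosen descriptors are illustrative):

# Hypothetical extra features: a few RDKit descriptors per molecule
def smiles_descriptors(s):
    mol = Chem.MolFromSmiles(s)
    return pd.Series({
        'MolWt': Descriptors.MolWt(mol),
        'MolLogP': Descriptors.MolLogP(mol),
        'TPSA': Descriptors.TPSA(mol),
        'NumHDonors': Descriptors.NumHDonors(mol),
        'NumHAcceptors': Descriptors.NumHAcceptors(mol),
    })

desc_df = data_smiles.apply(smiles_descriptors)
data = pd.concat([data, desc_df], axis=1)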
The following is Task 3:
# Build the training and test sets
train = data[data.Label.notnull()].reset_index(drop=True)
test = data[data.Label.isnull()].reset_index(drop=True)

x_train = train[cols]
x_test = test[cols]

# Training labels
y_train = train['Label'].astype(int)
print(x_train.shape, y_train.shape, x_test.shape)

def cv_model(clf, train_x, train_y, test_x, seed=2022):
    clf_name = clf.__name__

    kf = KFold(n_splits=5, shuffle=True, random_state=seed)

    train = np.zeros(train_x.shape[0])
    test = np.zeros(test_x.shape[0])

    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} {}************************************'.format(str(i+1), str(seed)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]
        
        params = {'learning_rate': 1, 'depth': 6, 'l2_leaf_reg': 100, 'bootstrap_type':'Bernoulli','random_seed':seed,
                  'od_type': 'Iter', 'od_wait': 100, 'allow_writing_files': False, 'task_type':'CPU'}

        model = clf(iterations=20000, **params, eval_metric='AUC')
        model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                  metric_period=100,
                  cat_features=[],
                  use_best_model=True, 
                  verbose=0)

        val_pred  = model.predict_proba(val_x)[:,1]
        test_pred = model.predict_proba(test_x)[:,1]
        
        train[valid_index] = val_pred
        test += test_pred / kf.n_splits
        cv_scores.append(f1_score(val_y, np.where(val_pred>0.5, 1, 0)))
        
        print(cv_scores)
        
    print(f"{clf_name}_score_list:", cv_scores)
    print(f"{clf_name}_score_mean:", np.mean(cv_scores))
    print(f"{clf_name}_score_std:", np.std(cv_scores))
    return train, test
    
cat_train, cat_test = cv_model(CatBoostClassifier, x_train, y_train, x_test)

pred = np.where(cat_test>0.5, 1, 0)

pd.DataFrame(
    {
        'uuid': test['uuid'],
        'Label': pred
    }
).to_csv('submit.csv', index=None)
print(pred.sum())
The following is the base code for Task 4:
atomic_masses = {
    'H': 1.008, 'He': 4.002602, 'Li': 6.94, 'Be': 9.0122, 'B': 10.81, 'C': 12.01,
    'N': 14.01, 'O': 16.00, 'F': 19.00, 'Ne': 20.180, 'Na': 22.990, 'Mg': 24.305,
    'Al': 26.982, 'Si': 28.085, 'P': 30.97, 'S': 32.07, 'Cl': 35.45, 'Ar': 39.95,
    'K': 39.10, 'Ca': 40.08, 'Sc': 44.956, 'Ti': 47.867, 'V': 50.942, 'Cr': 52.00,
    'Mn': 54.938, 'Fe': 55.845, 'Co': 58.933, 'Ni': 58.69, 'Cu': 63.55, 'Zn': 65.38
}

# Parse a single InChI string (a usage example follows the function definition)
def parse_inchi(inchi_str):
    formula = ''
    molecular_weight = 0
    element_counts = {}
    
    # Extract the molecular formula
    formula_match = re.search(r"InChI=1S/([^/]+)/c", inchi_str)
    if formula_match:
        formula = formula_match.group(1)
    
    # Compute the molecular weight and per-element atom counts
    for element, count in re.findall(r"([A-Z][a-z]*)([0-9]*)", formula):
        count = int(count) if count else 1
        # Look the symbol up as-is: upper-casing would turn 'Cl' into 'CL'
        # and miss every two-letter element in atomic_masses
        element_mass = atomic_masses.get(element, 0)
        molecular_weight += element_mass * count
        element_counts[element] = element_counts.get(element, 0) + count
    
    return pd.Series({
        'Formula': formula,
        'MolecularWeight': molecular_weight,
        'ElementCounts': element_counts
    })
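A quick usage example on aspirin's standard InChI (the string and the expected output are shown for illustration only):

# Illustrative check: aspirin, C9H8O4
inchi = 'InChI=1S/C9H8O4/c1-6(10)13-8-5-3-2-4-7(8)9(11)12/h2-5H,1H3,(H,11,12)'
print(parse_inchi(inchi))
# Expected: Formula 'C9H8O4', MolecularWeight ~180.15, ElementCounts {'C': 9, 'H': 8, 'O': 4}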

# Apply the function to every InChI string
element_counts = data_inchis.apply(parse_inchi)['ElementCounts']

# The element symbols to track
keys = ['H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne', 'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'Ar', 'K', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn']

# Create an empty DataFrame whose columns are the element symbols
df_expanded = pd.DataFrame(columns=keys).astype(float)

# Iterate over the records and fill the DataFrame
for index, item in enumerate(element_counts.values):
    for key in keys:
        # Copy each element's count from the dict into the matching column
        df_expanded.at[index, key] = item.get(key, 0)   # faster
        # df_expanded[key][index] = item.get(key, 0)    # slower (chained indexing)

# Merge the element-count features into data
data = pd.concat([data, df_expanded], axis=1)

# Build the training and test sets
# LightGBM rejects feature names containing special characters, so sanitize them first
def clean_feature_names(df):
    df.columns = df.columns.str.replace(r'[^A-Za-z0-9_]', '_', regex=True)
    return df

train = data[data.Label.notnull()].reset_index(drop=True)
test = data[data.Label.isnull()].reset_index(drop=True)

cols = data.columns[2:]
x_train = train[cols]
x_test = test[cols]

x_train = clean_feature_names(x_train)
x_test = clean_feature_names(x_test)

# Training labels
y_train = train['Label'].astype(int)

def cv_model(clf, train_x, train_y, test_x, clf_name, seed = 2023):
    folds = 5
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    oof = np.zeros(train_x.shape[0])
    test_predict = np.zeros(test_x.shape[0])
    cv_scores = []
    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]
        
        if clf_name == "lgb":
            train_matrix = lgb.Dataset(trn_x, label=trn_y)
            valid_matrix = lgb.Dataset(val_x, label=val_y)
            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'min_child_weight': 6,
                'num_leaves': 2 ** 6,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.35,
                'seed': 2024,
                'nthread': 16,
                'early_stopping_rounds': 100,
                # 'verbose_eval': 1000,
                'verbose' : -1,
            }
            model = lgb.train(params, train_matrix, num_boost_round=2000, valid_sets=[train_matrix, valid_matrix],
                              categorical_feature=[])
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)
        
        if clf_name == "xgb":
            xgb_params = {
              'booster': 'gbtree', 
              'objective': 'binary:logistic',
            #   'num_class':3,
              'max_depth': 5,
              'lambda': 10,
              'subsample': 0.7,
              'colsample_bytree': 0.7,
              'colsample_bylevel': 0.7,
              'eta': 0.35,
              'tree_method': 'hist',
              'seed': 520,
              'nthread': 16
              }
            train_matrix = clf.DMatrix(trn_x , label=trn_y)
            valid_matrix = clf.DMatrix(val_x , label=val_y)
            test_matrix = clf.DMatrix(test_x)
            
            watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]
            
            model = clf.train(xgb_params, train_matrix, num_boost_round=2000, evals=watchlist, verbose_eval=1000, early_stopping_rounds=100)
            val_pred  = model.predict(valid_matrix)
            test_pred = model.predict(test_matrix)
            
        if clf_name == "cat":
            params = {'learning_rate': 0.35, 'depth': 5, 'bootstrap_type':'Bernoulli','random_seed':2024,
                      'od_type': 'Iter', 'od_wait': 100, 'allow_writing_files': False}
            
            model = clf(iterations=2000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      metric_period=1000,
                      use_best_model=True, 
                      cat_features=[],
                      verbose=0)
            
            val_pred  = model.predict_proba(val_x)[:, 1]
            test_pred = model.predict_proba(test_x)[:, 1]
        
        oof[valid_index] = val_pred
        test_predict += test_pred / kf.n_splits
        
        F1_score = f1_score(val_y, np.where(val_pred>0.5, 1, 0))
        cv_scores.append(F1_score)
        print(F1_score)
    
    return oof, test_predict, cv_scores
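The imports include StratifiedKFold, yet the folds above are plain KFold. With an imbalanced binary Label, stratified folds keep the class ratio stable across splits; a one-line variant sketch for the cv_model above:

# Hypothetical variant: preserve the Label class ratio in every fold
kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
# kf.split(train_x, train_y) then yields stratified train/valid index pairs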

# Following the demo, call cv_model once per model, mirroring the baseline practice section
# LightGBM model
lgb_oof, lgb_test, lgb_scores = cv_model(LGBMClassifier, x_train, y_train, x_test, 'lgb')
# XGBoost model
xgb_oof, xgb_test, xgb_scores = cv_model(xgb, x_train, y_train, x_test, 'xgb')
# CatBoost model
cat_oof, cat_test, cat_scores = cv_model(CatBoostClassifier, x_train, y_train, x_test, 'cat')


from IPython.display import clear_output
clear_output()
print('LightGBM F1-score:', np.mean(lgb_scores))
print('LightGBM standard deviation:', np.std(lgb_scores))
print('XGBoost F1-score:', np.mean(xgb_scores))
print('XGBoost standard deviation:', np.std(xgb_scores))
print('CatBoost F1-score:', np.mean(cat_scores))
print('CatBoost standard deviation:', np.std(cat_scores))
The following is Task 4's mean-averaging fusion method:
# Average the three models' test predictions
final_test = (lgb_test + xgb_test + cat_test) / 3

pred = np.where(final_test>0.5, 1, 0)

pd.DataFrame(
    {
        'uuid': test['uuid'],
        'Label': pred
    }
).to_csv('submit.csv', index=None)
print(pred.sum())



# The following is Task 4's stacking method:
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import Ridge
from sklearn.metrics import roc_auc_score

train_stack = np.stack([lgb_oof, xgb_oof, cat_oof], axis=1)
test_stack = np.stack([lgb_test, xgb_test, cat_test], axis=1)
print(train_stack.shape, test_stack.shape)

oof = np.zeros((train_stack.shape[0]))
predictions = np.zeros((test_stack.shape[0]))
scores = []

folds = RepeatedKFold(n_splits=5, n_repeats=2, random_state=2021)
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_stack)): 
    print("fold n°{}".format(fold_+1))

    trn_data, trn_y = train_stack[trn_idx], y_train[trn_idx]
    val_data, val_y = train_stack[val_idx], y_train[val_idx]
    
    clf = Ridge(random_state=2024)
    clf.fit(trn_data, trn_y)

    oof[val_idx] = clf.predict(val_data)
    predictions += clf.predict(test_stack) / (folds.n_repeats * 5)
    
    score_single = roc_auc_score(val_y, oof[val_idx])
    scores.append(score_single)
    print(f'{fold_+1}/{folds.n_repeats * 5}', score_single)
print('mean: ',np.mean(scores))

pred = np.where(predictions>0.5, 1, 0)

pd.DataFrame(
    {
        'uuid': test['uuid'],
        'Label': pred
    }
).to_csv('submit.csv', index=None)
print(pred.sum())
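One design note: Ridge is a regressor, so the stacked predictions are unbounded scores rather than probabilities, which makes the fixed 0.5 cutoff somewhat arbitrary. A logistic-regression meta-learner is a common alternative; a minimal sketch over the same stacked inputs:

from sklearn.linear_model import LogisticRegression

# Hypothetical swap: a probabilistic meta-learner on the stacked OOF features
meta = LogisticRegression(max_iter=1000)
meta.fit(train_stack, y_train)
stack_proba = meta.predict_proba(test_stack)[:, 1]
pred = np.where(stack_proba > 0.5, 1, 0)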
