# AI for Science — life-science track baseline (line-by-line annotated / 生命科学赛道 Baseline)
import argparse
import gc
import os
import sys
import warnings
from collections import Counter, defaultdict

import numpy as np
import pandas as pd
import polars as pl
import xgboost as xgb  # fixed: `import xgb` is not a real package
import lightgbm as lgb  # fixed: `import lgb` is not a real package
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_log_error
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold

warnings.filterwarnings('ignore')
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of *df* to the smallest dtype that holds
    each column's observed [min, max] range, and return the frame.

    The frame is modified in place (columns are reassigned) and also
    returned for convenience.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns will be downcast.
    verbose : bool, default True
        If True, print the achieved memory reduction.

    Returns
    -------
    pandas.DataFrame
        The same frame, with numeric columns downcast.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2  # MiB before downcast
    for col in df.columns:
        col_type = df[col].dtype
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                # Pick the narrowest integer type whose range covers the data.
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                # NOTE(review): float16 keeps only ~3 significant digits —
                # acceptable for this baseline, but a precision loss.
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2  # MiB after downcast
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
            end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df
# Data locations: training files sit directly under `path`, test files
# under the `ai4bio_testset_final` subdirectory.
path = 'ai4bio'
# Only the first 10000 methylation-site rows are read (nrows=10000) to
# keep this baseline light; the map files carry the per-sample metadata.
traindata = pd.read_csv(f'{path}/traindata.csv', nrows=10000)
trainmap = pd.read_csv(f'{path}/trainmap.csv')
testdata = pd.read_csv(f'{path}/ai4bio_testset_final/testdata.csv', nrows=10000)
testmap = pd.read_csv(f'{path}/ai4bio_testset_final/testmap.csv')
def _transpose_to_samples(df):
    """Pivot a (cpgsite x sample) frame to one row per sample.

    The `cpgsite` column becomes positional integer feature columns
    0..n_sites-1, and the former column headers become a `sample_id`
    column.
    """
    df = df.set_index('cpgsite').T.reset_index().rename(columns={'index': 'sample_id'})
    # Rename site columns to 0..n-1 (generalizes the hard-coded 10000).
    df.columns = ['sample_id'] + list(range(df.shape[1] - 1))
    return df

# Reshape both sets and cache them as pickles for faster re-runs.
traindata = _transpose_to_samples(traindata)
traindata.to_pickle(f'{path}/traindata.pkl')
testdata = _transpose_to_samples(testdata)
testdata.to_pickle(f'{path}/testdata.pkl')
# --- Quick exploration (notebook-style cells; results are displayed,
# not stored — harmless no-ops when run as a script) ---
trainmap.head()
traindata.head()
traindata.info()
# Missing-value ratio of the first 10 methylation features.
for i in range(10):
    null_cnt = traindata[i].isnull().sum() / traindata.shape[0]
    print(f'特征{i},对应的缺失率为{null_cnt}')
# Pairwise correlation of the first 1000 features.
traindata[[i for i in range(1000)]].corr()
# Attach per-sample metadata; only `gender` exists for the test set
# (age is the prediction target, disease/sample_type are train-only).
traindata = traindata.merge(trainmap[['sample_id', 'age', 'gender', 'sample_type', 'disease']], on='sample_id', how='left')
testdata = testdata.merge(testmap[['sample_id', 'gender']], on='sample_id', how='left')

# Integer encodings for the categorical metadata columns.
disease_mapping = {
    'control': 0,
    "Alzheimer's disease": 1,
    "Graves' disease": 2,
    "Huntington's disease": 3,
    "Parkinson's disease": 4,
    'rheumatoid arthritis': 5,
    'schizophrenia': 6,
    "Sjogren's syndrome": 7,
    'stroke': 8,
    'type 2 diabetes': 9
}
sample_type_mapping = {'control': 0, 'disease tissue': 1}
gender_mapping = {'F': 0, 'M': 1}
traindata['disease'] = traindata['disease'].map(disease_mapping)
traindata['sample_type'] = traindata['sample_type'].map(sample_type_mapping)
traindata['gender'] = traindata['gender'].map(gender_mapping)
testdata['gender'] = testdata['gender'].map(gender_mapping)

# Row-wise summary statistics over the 10000 methylation features.
# Slice the feature block once per frame instead of once per statistic
# (the original recomputed the 10000-column selection 7 times each).
feature_cols = [i for i in range(10000)]
for _df in (traindata, testdata):
    _block = _df[feature_cols]
    for _stat in ('max', 'min', 'std', 'var', 'skew', 'mean', 'median'):
        _df[_stat] = getattr(_block, _stat)(axis=1)

# Model inputs: raw features + gender + the aggregate statistics.
cols = feature_cols + ['gender', 'max', 'min', 'std', 'var', 'skew', 'mean', 'median']
def catboost_model(train_x, train_y, test_x, seed=2023):
    """Train a 5-fold CatBoost regressor and return OOF/test predictions.

    Parameters
    ----------
    train_x : pandas.DataFrame
        Training features.
    train_y : array-like
        Regression target, positionally indexable (e.g. `traindata['age']`).
    test_x : pandas.DataFrame
        Test features with the same columns as *train_x*.
    seed : int, default 2023
        Seed for both the KFold shuffle and CatBoost's RNG.

    Returns
    -------
    (oof, test_predict) : (np.ndarray, np.ndarray)
        Out-of-fold predictions aligned with *train_x* rows, and the
        fold-averaged predictions for *test_x*.
    """
    folds = 5
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    oof = np.zeros(train_x.shape[0])
    test_predict = np.zeros(test_x.shape[0])
    cv_scores = []
    # Hyper-parameters are fold-invariant — build the dict once.
    params = {'learning_rate': 0.1,
              'depth': 5,
              'bootstrap_type': 'Bernoulli',
              # Bug fix: was hard-coded to 2023, silently ignoring `seed`.
              'random_seed': seed,
              'od_type': 'Iter',
              'od_wait': 100,
              'allow_writing_files': False,
              # NOTE(review): requires a GPU with two devices; set
              # task_type='CPU' (and drop 'devices') to run without one.
              'task_type': "GPU",
              'devices': '0:1'}
    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y = train_x.iloc[train_index], train_y[train_index]
        val_x, val_y = train_x.iloc[valid_index], train_y[valid_index]
        model = CatBoostRegressor(iterations=500, **params)
        model.fit(trn_x, trn_y,
                  eval_set=(val_x, val_y),
                  metric_period=500,
                  use_best_model=True,
                  cat_features=[],
                  verbose=1)
        val_pred = model.predict(val_x)
        test_pred = model.predict(test_x)
        oof[valid_index] = val_pred
        # Average each fold's test prediction into the final estimate.
        test_predict += test_pred / kf.n_splits
        score = mean_absolute_error(val_y, val_pred)
        cv_scores.append(score)
        print(cv_scores)
        if i == 0:
            # Persist feature importances from the first fold only.
            fea_score = pd.DataFrame({'fea_name': model.feature_names_,
                                      'score': model.feature_importances_})
            fea_score = fea_score.sort_values('score', ascending=False)
            fea_score.to_csv('feature_importances.csv', index=False)
    return oof, test_predict
# Fit the model and predict age for the test set.
cat_oof, cat_test = catboost_model(traindata[cols], traindata['age'], testdata[cols])
# Clip negative ages to 0 and format with two decimals. ('%.2f' already
# produces a string, so the original astype(float)/astype(str)
# round-trips were redundant.)
testdata['age'] = ['%.2f' % (x if x > 0 else 0.0) for x in cat_test]
testdata[['sample_id','age']].to_csv('submit.txt',index=False)