分析代码来自Machine learning from omics data,从 main 开始
文章目录
#main.py
current_file_path = os.path.dirname(os.path.realpath('__file__'))
base_dir = os.path.join(current_file_path, os.path.pardir)
- 加入这两行代码放在 a/b/c.py 中,os.path.realpath(‘__file__’) 得到结果是 a/b/_file_,os.path.dirname(os.path.realpath(‘__file__’)) 得到的结果是 a/b,即 os.getcwd(),current_file_path 是当前脚本的目录完整路径
- os.path.pardir 得到父目录,这里输出是 …,os.path.join(current_file_path, os.path.pardir) 得到 a/b/…
1.dl_omics
#main.py
nested_group_cv = False
max_local_threads = 4
target_name = 'dili'
df, genes, meta_columns = create_l1000_df()
#dl_omics.py
current_file_path = os.path.dirname(os.path.realpath('__file__'))
data_path = os.path.join(current_file_path, os.pardir, 'data')
meta_data_url = 'https://ndownloader.figstatic.com/files/25612817'
meta_data_file = os.path.join(data_path, 'Table_1_meta_data.xlsx')
raw_counts_url = 'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE92742&format=file&file' \
'=GSE92742%5FBroad%5FLINCS%5FLevel5%5FCOMPZ%2EMODZ%5Fn473647x12328%2Egctx%2Egz'
raw_counts_file_compressed = os.path.join(
data_path, 'GSE92742_Broad_LINCS_Level5_COMPZ.MODZ_n473647x12328.gctx.gz')
raw_counts_file = raw_counts_file_compressed[:-3]
gene_info_url = 'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE92742&format=file&file' \
'=GSE92742%5FBroad%5FLINCS%5Fgene%5Finfo%2Etxt%2Egz'
gene_info_file = os.path.join(data_path, 'GSE92742_Broad_LINCS_gene_info.txt.gz')
l1000_df_file = os.path.join(data_path, 'l1000_df.bin')
N_GENES = 12_328
def create_l1000_df():
os.makedirs(data_path, exist_ok=True)
if os.path.exists(l1000_df_file):
return load(l1000_df_file)
if not os.path.exists(raw_counts_file):
if not os.path.exists(raw_counts_file_compressed):
print('Downloading counts from GEO...')
urlretrieve(raw_counts_url, filename=raw_counts_file_compressed)
gunzip(raw_counts_file_compressed, raw_counts_file)
if not os.path.exists(gene_info_file):
urlretrieve(gene_info_url, filename=gene_info_file)
if not os.path.exists(meta_data_file):
urlretrieve(meta_data_url, filename=meta_data_file)
meta_df = read_excel(meta_data_file)
sample_names = meta_df['NIH LINCS L1000 sig_id']
gene_df = read_csv(gene_info_file, sep='\t')
assert gene_df.shape[0] == N_GENES
gene_df = gene_df.set_index('pr_gene_id')
counts_df = parse.parse(raw_counts_file).data_df
assert counts_df.shape[0] == N_GENES
counts_df = counts_df[sample_names]
counts_df.index = counts_df.index.astype(int)
df = counts_df.join(gene_df, how='inner')
assert df.shape[0] == N_GENES
df = df.loc[df.pr_is_lm == 1, :]
df = df.set_index('pr_gene_symbol')
assert df.index.is_unique
df = df[sample_names].T
columns_in_excel_table = [
'NIH LINCS L1000 sig_id',
'Usage',
'Compound Names',
'DILIst binaray classfication ',
'Cell_id',
'Dose',
'Treatment Duration (hr)']
meta_df = meta_df[columns_in_excel_table]
meta_df.columns = [
'sample_id',
'set',
'compound',
'dili',
'cell_id',
'dose',
'duration'
]
meta_df = meta_df.set_index('sample_id')
meta_columns = meta_df.columns
genes = df.columns.to_list()
df = df.join(meta_df)
dump((df, genes, meta_columns), l1000_df_file, compress=3)
return df, genes, meta_columns
-
分别构建文件名字符串 meta_data_file,raw_counts_file,gene_info_file,l1000_df_file,如果不存在文件就下载对应的数据
-
返回构建好的数据集和列名
2.main
train_index = df['set'] == 'Training'
validation_index = df['set'] == 'Test'
X_train, y_train = df.loc[train_index, genes], df.loc[train_index, target_name]
X_validation, y_validation = df.loc[validation_index, genes], \
df.loc[validation_index, target_name]
svc = svm.LinearSVC(max_iter=50_000)
selector = feature_selection.SelectKBest(
score_func=feature_selection.mutual_info_classif
)
estimator = pipeline.Pipeline([
('scaler', preprocessing.StandardScaler()),
('selector', selector),
('svc', svc)
])
scaler_grid = ['passthrough', preprocessing.StandardScaler(), preprocessing.RobustScaler()]
k_grid = np.arange(10, 201, 10)
c_grid = 10. ** np.arange(-2, 3)
param_grid = {
'scaler': scaler_grid,
'selector__k': k_grid,
'svc__C': c_grid
}
inner_cv = model_selection.RepeatedStratifiedKFold(n_splits=5, n_repeats=5)
grid_search_file = 'grid_search.pkl'
grid_search_file = os.path.join(base_dir, grid_search_file)
mcc_scorer = metrics.make_scorer(metrics.matthews_corrcoef, greater_is_better=True)
try:
search = joblib.load(grid_search_file)
except FileNotFoundError:
n_configurations = np.product(list(map(len, param_grid.values())))
n_jobs = int(os.environ.get('SLURM_CPUS_ON_NODE', max_local_threads))
print(f'Trying {n_configurations} different configurations using {n_jobs} CPUs')
search = model_selection.GridSearchCV(
estimator, param_grid, cv=inner_cv, scoring=mcc_scorer, n_jobs=n_jobs,
refit=False, return_train_score=True, verbose=2
)
search.fit(X_train, y_train, groups=df.loc[train_index, 'compound'])
joblib.dump(search, grid_search_file, compress=3)
print(search.best_params_)
print(search.best_score_)
estimator.set_params(**search.best_params_)
estimator.fit(X_train, y_train)
n_selected_features = sum(estimator.named_steps['selector'].get_support())
n_features = X_train.shape[1]
print(f'{n_selected_features} out of {n_features} features used')
y_validation_predicted = estimator.predict(X_validation)
accuracy = metrics.accuracy_score(y_validation, y_validation_predicted)
mcc = metrics.matthews_corrcoef(y_validation, y_validation_predicted)
print('accuracy:', accuracy)
print('MCC:', mcc)
models = []
for train_index_num, _ in inner_cv.split(df[genes], df[target_name], groups=df['compound']):
estimator = deepcopy(estimator)
estimator.set_params(**search.best_params_)
estimator.fit(df[genes].iloc[train_index_num], df[target_name].iloc[train_index_num])
models.append(estimator)
models_file = 'models_nested.pkl' if nested_group_cv else 'models.pkl'
models_file = os.path.join(base_dir, models_file)
joblib.dump(models, models_file, compress=3)
- 进行特征标准化,根据互信息选择特征,使用 SVM 分类模型
- model_selection.RepeatedStratifiedKFold 是 K-折交叉验证类
- metrics.matthews_corrcoef 作为打分参数构建评分类
- model_selection.GridSearchCV 网格搜索最优超参数
- 将训练好的模型保存在 models.pkl 文件中
3.create_plots
matplotlib.use('Agg')
sns.set(style='whitegrid')
sns.set_context('paper', font_scale=1.3)
current_file_path = os.path.dirname(os.path.realpath(__file__))
base_dir = os.path.join(current_file_path, os.path.pardir)
def main():
image_folder = os.path.join(base_dir, 'figures')
os.makedirs(image_folder, exist_ok=True)
grid_search_file = 'grid_search.pkl'
grid_search_file = os.path.join(base_dir, grid_search_file)
create_grid_search_plots(grid_search_file, image_folder)
create_data_plots(image_folder)
create_coeff_boxplot(image_folder)
- 依次画出超参数选择,数据,特征贡献图
3.1.create_grid_search_plots
def create_grid_search_plots(grid_search_file, image_folder):
search = joblib.load(grid_search_file)
search_result_df = pd.DataFrame(search.cv_results_)
param_name = 'param_selector__k'
param_display_name = 'k'
# find configuration with best test score for each k
best_score_per_k_index = search_result_df.groupby(param_name)['mean_test_score']\
.idxmax()
search_result_df = search_result_df.loc[best_score_per_k_index, :]
# convert results to long format
param_names = ['param_scaler', 'param_selector__k', 'param_svc__C']
train_split_names = [c for c in search_result_df.columns if
c.startswith('split') and c.endswith('train_score')]
test_split_names = [c for c in search_result_df.columns if
c.startswith('split') and c.endswith('test_score')]
data = []
for index, row in search_result_df.iterrows():
param_values = row[param_names].tolist()
train_scores = row[train_split_names].tolist()
test_scores = row[test_split_names].tolist()
for train_score in train_scores:
data.append(param_values + ['train', train_score, row.mean_train_score, index])
for test_score in test_scores:
data.append(param_values + ['test', test_score, row.mean_test_score, index])
plot_data = pd.DataFrame(
data, columns=['scaler', 'k', 'C', 'split', 'MCC', 'mean', 'index'])
plot_data['scaler'] = plot_data['scaler'].astype(str)
plot_data = plot_data.rename(columns={'split': 'Split'})
fig, ax = plt.subplots(figsize=(9, 4))
sns.lineplot(
data=plot_data,
x=param_display_name, y='MCC', hue='Split', hue_order=['train', 'test'], ax=ax
)
x_ticks = sorted(plot_data[param_display_name].unique())
x_ticks = x_ticks[::2]
ax.set_xticks(x_ticks)
x = search.best_params_[param_name.replace('param_', '')]
y = search.best_score_
ax.plot(x, y, '*k', markersize=15, zorder=-1, alpha=0.8,
color=ax.lines[1].get_color())
ax.set_xlim(plot_data[param_display_name].min(), plot_data[param_display_name].max())
ax.set_xlabel('Number of features')
ax.set_ylabel('Model performance (MCC)')
image_path = os.path.join(image_folder, f'figure03_grid_search_{param_display_name}.tiff')
save_figure(image_path)
3.2.create_data_plots
def create_data_plots(image_folder):
df, genes, meta_columns = create_l1000_df()
target_name = 'DILI'
df[target_name] = df['dili'].replace({0: 'negative', 1: 'positive'})
reduced_df = create_umap_df(df, features=genes, densmap=True)
fig, ax = plt.subplots(figsize=(9, 6))
scatter_plot(reduced_df, hue='DILI', alpha=0.8, s=70, ax=ax)
image_path = os.path.join(image_folder, 'figure01_umap_DILI.tiff')
save_figure(image_path)
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(9, 3))
corr_df = df[genes].corr(method='spearman').abs()
np.fill_diagonal(corr_df.values, np.nan)
sns.histplot(corr_df.max(axis=1), bins=20, ax=ax1)
ax1.set_xlabel('Maximum absolute Spearman\'s\ncorrelation between features')
mi = mutual_info_classif(df[genes], df['dili'])
sns.histplot(mi, bins=10, ax=ax2)
ax2.set_xlabel('Mutual information with DILI class')
ax2.set_ylabel(None)
image_path = os.path.join(image_folder, 'figure02_corr_mi.tiff')
save_figure(image_path)
3.3.create_coeff_boxplot
def create_coeff_boxplot(image_folder):
df, genes, meta_columns = create_l1000_df()
models_file = os.path.join(base_dir, 'models.pkl')
models = joblib.load(models_file)
coefficient_list = []
for model in models:
selected_features = model.named_steps['selector'].get_support()
selected_features = np.array(genes)[selected_features]
coefficients = np.abs(model.named_steps['svc'].coef_)[0]
coefficient_list.append(
pd.DataFrame({'Gene': selected_features, 'Coefficient': coefficients})
)
coefficient_df = pd.concat(coefficient_list)
top_genes = coefficient_df.groupby('Gene').median().sort_values('Coefficient', ascending=False)
top_genes = top_genes.head(n=5)
plot_df = coefficient_df.loc[coefficient_df.Gene.isin(top_genes.index), :]
fig, ax = plt.subplots(figsize=(9, 4))
image_path = os.path.join(image_folder, 'figure04_coefficients.tiff')
sns.boxplot(data=plot_df, x='Gene', y='Coefficient', order=top_genes.index, ax=ax)
ax.set_ylabel('Absolute coefficient')
save_figure(image_path)