sklearn 机器学习流程

_森罗万象

已于 2022-09-21 16:50:38 修改

阅读量522

点赞数

分类专栏：代码解析文章标签：机器学习 sklearn

于 2022-09-21 16:49:12 首次发布

本文链接：https://blog.csdn.net/weixin_52812620/article/details/126976061

版权

代码解析专栏收录该内容

21 篇文章 51 订阅

订阅专栏

分析代码来自Machine learning from omics data，从 main 开始

文章目录

1.dl_omics
2.main
3.create_plots

#main.py
current_file_path = os.path.dirname(os.path.realpath('__file__'))
base_dir = os.path.join(current_file_path, os.path.pardir)

加入这两行代码放在 a/b/c.py 中，os.path.realpath(‘__file__’) 得到结果是 a/b/_file_，os.path.dirname(os.path.realpath(‘__file__’)) 得到的结果是 a/b，即 os.getcwd()，current_file_path 是当前脚本的目录完整路径
os.path.pardir 得到父目录，这里输出是 …，os.path.join(current_file_path, os.path.pardir) 得到 a/b/…

1.dl_omics

#main.py
nested_group_cv = False
max_local_threads = 4

target_name = 'dili'
df, genes, meta_columns = create_l1000_df()

#dl_omics.py
current_file_path = os.path.dirname(os.path.realpath('__file__'))
data_path = os.path.join(current_file_path, os.pardir, 'data')

meta_data_url = 'https://ndownloader.figstatic.com/files/25612817'
meta_data_file = os.path.join(data_path, 'Table_1_meta_data.xlsx')

raw_counts_url = 'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE92742&format=file&file' \
                 '=GSE92742%5FBroad%5FLINCS%5FLevel5%5FCOMPZ%2EMODZ%5Fn473647x12328%2Egctx%2Egz'

raw_counts_file_compressed = os.path.join(
    data_path, 'GSE92742_Broad_LINCS_Level5_COMPZ.MODZ_n473647x12328.gctx.gz')
raw_counts_file = raw_counts_file_compressed[:-3]

gene_info_url = 'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE92742&format=file&file' \
                '=GSE92742%5FBroad%5FLINCS%5Fgene%5Finfo%2Etxt%2Egz'
gene_info_file = os.path.join(data_path, 'GSE92742_Broad_LINCS_gene_info.txt.gz')

l1000_df_file = os.path.join(data_path, 'l1000_df.bin')

N_GENES = 12_328

def create_l1000_df():

    os.makedirs(data_path, exist_ok=True)

    if os.path.exists(l1000_df_file):
        return load(l1000_df_file)

    if not os.path.exists(raw_counts_file):

        if not os.path.exists(raw_counts_file_compressed):
            print('Downloading counts from GEO...')
            urlretrieve(raw_counts_url, filename=raw_counts_file_compressed)

        gunzip(raw_counts_file_compressed, raw_counts_file)

    if not os.path.exists(gene_info_file):
        urlretrieve(gene_info_url, filename=gene_info_file)

    if not os.path.exists(meta_data_file):
        urlretrieve(meta_data_url, filename=meta_data_file)

    meta_df = read_excel(meta_data_file)
    sample_names = meta_df['NIH LINCS L1000 sig_id']

    gene_df = read_csv(gene_info_file, sep='\t')
    assert gene_df.shape[0] == N_GENES
    gene_df = gene_df.set_index('pr_gene_id')

    counts_df = parse.parse(raw_counts_file).data_df
    assert counts_df.shape[0] == N_GENES
    counts_df = counts_df[sample_names]
    counts_df.index = counts_df.index.astype(int)

    df = counts_df.join(gene_df, how='inner')
    assert df.shape[0] == N_GENES
    df = df.loc[df.pr_is_lm == 1, :]
    df = df.set_index('pr_gene_symbol')
    assert df.index.is_unique
    df = df[sample_names].T

    columns_in_excel_table = [
        'NIH LINCS L1000 sig_id',
        'Usage',
        'Compound Names',
        'DILIst binaray classfication ',
        'Cell_id',
        'Dose',
        'Treatment Duration (hr)']
    meta_df = meta_df[columns_in_excel_table]
    meta_df.columns = [
        'sample_id',
        'set',
        'compound',
        'dili',
        'cell_id',
        'dose',
        'duration'
    ]
    meta_df = meta_df.set_index('sample_id')
    meta_columns = meta_df.columns

    genes = df.columns.to_list()
    df = df.join(meta_df)

    dump((df, genes, meta_columns), l1000_df_file, compress=3)

    return df, genes, meta_columns

分别构建文件名字符串 meta_data_file，raw_counts_file，gene_info_file，l1000_df_file，如果不存在文件就下载对应的数据
返回构建好的数据集和列名

2.main

train_index = df['set'] == 'Training'
validation_index = df['set'] == 'Test'

X_train, y_train = df.loc[train_index, genes], df.loc[train_index, target_name]
X_validation, y_validation = df.loc[validation_index, genes], \
df.loc[validation_index, target_name]

svc = svm.LinearSVC(max_iter=50_000)
selector = feature_selection.SelectKBest(
    score_func=feature_selection.mutual_info_classif
)
estimator = pipeline.Pipeline([
    ('scaler', preprocessing.StandardScaler()),
    ('selector', selector),
    ('svc', svc)
])

scaler_grid = ['passthrough', preprocessing.StandardScaler(), preprocessing.RobustScaler()]
k_grid = np.arange(10, 201, 10)
c_grid = 10. ** np.arange(-2, 3)

param_grid = {
    'scaler': scaler_grid,
    'selector__k': k_grid,
    'svc__C': c_grid
}

inner_cv = model_selection.RepeatedStratifiedKFold(n_splits=5, n_repeats=5)
grid_search_file = 'grid_search.pkl'
grid_search_file = os.path.join(base_dir, grid_search_file)

mcc_scorer = metrics.make_scorer(metrics.matthews_corrcoef, greater_is_better=True)

try:
    search = joblib.load(grid_search_file)
except FileNotFoundError:
    n_configurations = np.product(list(map(len, param_grid.values())))
    n_jobs = int(os.environ.get('SLURM_CPUS_ON_NODE', max_local_threads))
    print(f'Trying {n_configurations} different configurations using {n_jobs} CPUs')
    search = model_selection.GridSearchCV(
        estimator, param_grid, cv=inner_cv, scoring=mcc_scorer, n_jobs=n_jobs,
        refit=False, return_train_score=True, verbose=2
    )
    search.fit(X_train, y_train, groups=df.loc[train_index, 'compound'])
    joblib.dump(search, grid_search_file, compress=3)

print(search.best_params_)
print(search.best_score_)

estimator.set_params(**search.best_params_)
estimator.fit(X_train, y_train)

n_selected_features = sum(estimator.named_steps['selector'].get_support())
n_features = X_train.shape[1]

print(f'{n_selected_features} out of {n_features} features used')

y_validation_predicted = estimator.predict(X_validation)
accuracy = metrics.accuracy_score(y_validation, y_validation_predicted)
mcc = metrics.matthews_corrcoef(y_validation, y_validation_predicted)
print('accuracy:', accuracy)
print('MCC:', mcc)

models = []
for train_index_num, _ in inner_cv.split(df[genes], df[target_name], groups=df['compound']):
    estimator = deepcopy(estimator)
    estimator.set_params(**search.best_params_)
    estimator.fit(df[genes].iloc[train_index_num], df[target_name].iloc[train_index_num])
    models.append(estimator)

models_file = 'models_nested.pkl' if nested_group_cv else 'models.pkl'
models_file = os.path.join(base_dir, models_file)
joblib.dump(models, models_file, compress=3)

进行特征标准化，根据互信息选择特征，使用 SVM 分类模型
model_selection.RepeatedStratifiedKFold 是 K-折交叉验证类
metrics.matthews_corrcoef 作为打分参数构建评分类
model_selection.GridSearchCV 网格搜索最优超参数
将训练好的模型保存在 models.pkl 文件中

3.create_plots

matplotlib.use('Agg')
sns.set(style='whitegrid')
sns.set_context('paper', font_scale=1.3)

current_file_path = os.path.dirname(os.path.realpath(__file__))
base_dir = os.path.join(current_file_path, os.path.pardir)

def main():

    image_folder = os.path.join(base_dir, 'figures')
    os.makedirs(image_folder, exist_ok=True)

    grid_search_file = 'grid_search.pkl'
    grid_search_file = os.path.join(base_dir, grid_search_file)
    create_grid_search_plots(grid_search_file, image_folder)

    create_data_plots(image_folder)

    create_coeff_boxplot(image_folder)

依次画出超参数选择，数据，特征贡献图

3.1.create_grid_search_plots

def create_grid_search_plots(grid_search_file, image_folder):

    search = joblib.load(grid_search_file)
    search_result_df = pd.DataFrame(search.cv_results_)

    param_name = 'param_selector__k'
    param_display_name = 'k'

    # find configuration with best test score for each k
    best_score_per_k_index = search_result_df.groupby(param_name)['mean_test_score']\
        .idxmax()
    search_result_df = search_result_df.loc[best_score_per_k_index, :]

    # convert results to long format
    param_names = ['param_scaler', 'param_selector__k', 'param_svc__C']
    train_split_names = [c for c in search_result_df.columns if
                         c.startswith('split') and c.endswith('train_score')]
    test_split_names = [c for c in search_result_df.columns if
                        c.startswith('split') and c.endswith('test_score')]
    data = []
    for index, row in search_result_df.iterrows():
        param_values = row[param_names].tolist()
        train_scores = row[train_split_names].tolist()
        test_scores = row[test_split_names].tolist()
        for train_score in train_scores:
            data.append(param_values + ['train', train_score, row.mean_train_score, index])
        for test_score in test_scores:
            data.append(param_values + ['test', test_score, row.mean_test_score, index])

    plot_data = pd.DataFrame(
        data, columns=['scaler', 'k', 'C', 'split', 'MCC', 'mean', 'index'])
    plot_data['scaler'] = plot_data['scaler'].astype(str)
    plot_data = plot_data.rename(columns={'split': 'Split'})

    fig, ax = plt.subplots(figsize=(9, 4))
    sns.lineplot(
        data=plot_data,
        x=param_display_name, y='MCC', hue='Split', hue_order=['train', 'test'], ax=ax
    )

    x_ticks = sorted(plot_data[param_display_name].unique())
    x_ticks = x_ticks[::2]
    ax.set_xticks(x_ticks)

    x = search.best_params_[param_name.replace('param_',  '')]
    y = search.best_score_
    ax.plot(x, y, '*k', markersize=15, zorder=-1, alpha=0.8,
            color=ax.lines[1].get_color())

    ax.set_xlim(plot_data[param_display_name].min(), plot_data[param_display_name].max())
    ax.set_xlabel('Number of features')
    ax.set_ylabel('Model performance (MCC)')

    image_path = os.path.join(image_folder, f'figure03_grid_search_{param_display_name}.tiff')
    save_figure(image_path)

3.2.create_data_plots

def create_data_plots(image_folder):

    df, genes, meta_columns = create_l1000_df()

    target_name = 'DILI'
    df[target_name] = df['dili'].replace({0: 'negative', 1: 'positive'})

    reduced_df = create_umap_df(df, features=genes, densmap=True)

    fig, ax = plt.subplots(figsize=(9, 6))
    scatter_plot(reduced_df, hue='DILI', alpha=0.8, s=70, ax=ax)
    image_path = os.path.join(image_folder, 'figure01_umap_DILI.tiff')
    save_figure(image_path)

    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(9, 3))
    corr_df = df[genes].corr(method='spearman').abs()
    np.fill_diagonal(corr_df.values, np.nan)
    sns.histplot(corr_df.max(axis=1), bins=20, ax=ax1)
    ax1.set_xlabel('Maximum absolute Spearman\'s\ncorrelation between features')

    mi = mutual_info_classif(df[genes], df['dili'])
    sns.histplot(mi, bins=10, ax=ax2)
    ax2.set_xlabel('Mutual information with DILI class')
    ax2.set_ylabel(None)
    image_path = os.path.join(image_folder, 'figure02_corr_mi.tiff')
    save_figure(image_path)

3.3.create_coeff_boxplot

def create_coeff_boxplot(image_folder):

    df, genes, meta_columns = create_l1000_df()

    models_file = os.path.join(base_dir, 'models.pkl')
    models = joblib.load(models_file)
    coefficient_list = []
    for model in models:
        selected_features = model.named_steps['selector'].get_support()
        selected_features = np.array(genes)[selected_features]
        coefficients = np.abs(model.named_steps['svc'].coef_)[0]
        coefficient_list.append(
            pd.DataFrame({'Gene': selected_features, 'Coefficient': coefficients})
        )
    coefficient_df = pd.concat(coefficient_list)

    top_genes = coefficient_df.groupby('Gene').median().sort_values('Coefficient', ascending=False)
    top_genes = top_genes.head(n=5)
    plot_df = coefficient_df.loc[coefficient_df.Gene.isin(top_genes.index), :]

    fig, ax = plt.subplots(figsize=(9, 4))
    image_path = os.path.join(image_folder, 'figure04_coefficients.tiff')
    sns.boxplot(data=plot_df, x='Gene', y='Coefficient', order=top_genes.index, ax=ax)
    ax.set_ylabel('Absolute coefficient')
    save_figure(image_path)