对 CircR2Disease v2.0 进行数据清洗产生circRNA-disease关联矩阵的方法

对 CircR2Disease v2.0 进行数据清洗产生circRNA-disease关联矩阵的方法

circRNA-disease关联矩阵是使用机器学习和深度学习方法预测与疾病存在潜在关联的circRNA时所必需的数据。为此,本文基于收录了最新关联数据的数据库 CircR2Disease v2.0,向大家介绍一种使用pandas进行数据清洗、生成circRNA-disease关联矩阵的方法。具体实现分为以下几步,后文会逐步展开说明。

  • 下载所需数据库 - CircR2Disease v2.0,是陕西师范大学,雷秀娟教授,于2021年11月24日在Genomics, Proteomics & Bioinformatics(IF=7.691)刊上发表的记录circRNA与disease关系的数据库,并提供在线查询,文档下载等功能。其文章链接为https://www.sciencedirect.com/science/article/pii/S1672022921002461。其统计的数据库链接为 http://bioinfo.snnu.edu.cn/CircR2Disease_v2.0。在这篇博客中,我们只需要它 “The circRNA-disease entries.xlsx”中的文档数据。如下图所示。
    在这里插入图片描述

  • 筛除冗余 - 从文档中筛去与人类疾病无关的数据统计,只保留与人类疾病相关的数据。

  • pandas进行数据处理 - 鉴于pandas在处理Excel数据方面的优势(可以直接用circRNA和disease的名字作为行、列索引),本文使用pandas编写代码。


1、删除冗余数据

我们可以看到 “The circRNA-disease entries.xlsx” 表格统计中的“Species”一列中既有 human,还有 mouse,rat的数据统计,根据需求,我们就只需要human的数据统计。那我们的一个思路就是只保留human字段所在的行。也可理解为第一步数据清洗。其处理的代码如下:

Code block 1
import pandas as pd
from pandas import DataFrame as df

# Load the full CircR2Disease entry table. The path is a raw string so that the
# backslashes are taken literally instead of being parsed as escape sequences
# ('\d' and '\T' are not valid escapes).
data = pd.read_excel(r'.\data\The circRNA-disease entries.xlsx')
# Keep only the rows whose "Species" value is exactly 'Human'. A boolean mask
# selects them directly, so the result does not depend on the index labels
# happening to coincide with row positions.
human_data = data[data.Species == 'Human']
# Write the filtered rows back out, keeping the header and omitting the index.
human_data.to_excel(
    r'F:\pycharmspace\pycharm-project\daily_dome\data_preprocessing_2\inputData\The circRNA-disease entries of Human(2).xlsx',
    index=False, header=True, engine='xlsxwriter')

处理过后的表格如下:可见只保留了Human的数据统计。
在这里插入图片描述

2、 根据 circRNA Name 和 Disease Name 数据计算circRNA-disease关联矩阵

2.1、唯一化 circRNA 名字 和 Disease 名字,用来作为关联矩阵的行,列索引。
Code block 2
# 获取circRNA和disease的唯一索引
    def getUniqueIndex(self, data):
        """Return the distinct circRNA names and the distinct disease names.

        ``data`` must provide the columns 'circRNA Name' and 'Disease Name'.
        Each returned list keeps the order in which a name first appears in
        its column.

        Returns:
            A ``(circRNA_names, disease_names)`` tuple of two lists.
        """
        unique_per_column = [
            list(data[column_name].drop_duplicates())
            for column_name in ('circRNA Name', 'Disease Name')
        ]
        return unique_per_column[0], unique_per_column[1]
2.2、获取每个circRNA所关联的疾病,用列表存储。
Code block 3
# 获取与单个circRNA所关联的疾病
    def search_related_disease(self, circRNA_target):
        """Return the diseases recorded for one circRNA.

        The first column of ``self.data`` is compared with ``circRNA_target``
        and the matching entries of the second column are returned in row
        order. Repeated (circRNA, disease) rows yield repeated entries.

        Args:
            circRNA_target: circRNA name to look up.

        Returns:
            A list of the second-column values of every matching row; empty
            when no row matches.
        """
        # Select both columns by position with iloc. Indexing a string-labelled
        # row Series with the integers 0/1 relies on a positional fallback that
        # pandas has deprecated, and scanning row by row is needlessly slow.
        circRNA_names = self.data.iloc[:, 0]
        disease_names = self.data.iloc[:, 1]
        return disease_names[circRNA_names == circRNA_target].tolist()

2.3、初步计算circRNA-disease关联矩阵,保存为“ori_association_matrix.xlsx”。
Code block 4
    def construct_ori_association(self) -> pd.DataFrame:
        """Build the initial 0/1 circRNA-disease association matrix and save it.

        Rows are the unique circRNA names and columns the unique disease names
        returned by ``getUniqueIndex``. A cell is 1 when
        ``search_related_disease`` reports that disease for that circRNA, and
        0 otherwise. The matrix is also written to
        ``ori_association_matrix.xlsx``.

        Returns:
            The association matrix as a DataFrame.
        """
        circRNAs, diseases = self.getUniqueIndex(self.data)
        # Create the matrix already filled with integer zeros instead of
        # building an all-NaN object-dtype frame and filling it afterwards.
        ori_association_matrix = pd.DataFrame(0, index=circRNAs, columns=diseases, dtype=int)
        for circRNA in circRNAs:
            for disease in self.search_related_disease(circRNA):
                ori_association_matrix.loc[circRNA, disease] = 1
        ori_association_matrix.to_excel(
            r'F:\pycharmspace\pycharm-project\daily_dome\data_preprocessing_2\outputData_2\ori_association_matrix.xlsx')
        return ori_association_matrix

3、对初步产生的circRNA-disease关联矩阵进行数据密集规范化处理。

3.1、设置阈值(代码中的 dropRows_threshold 和 dropColumns_threshold),将关联数小于阈值的行或列删除
Code block 5
# 对列进行密集处理
 def drop_columns_of_ori_associationMatrix(self):
     """Drop, in place, every column whose sum is below the column threshold.

     A column of ``self.ori_association_matrix`` is removed when the sum of
     its entries is strictly less than ``self.dropColumns_threshold``.
     Returns nothing; the matrix is modified in place.
     """
     threshold_value = self.dropColumns_threshold
     print('threshold_value_col:', threshold_value)
     sum_of_cols = self.ori_association_matrix.sum(axis=0)
     # Collect every sparse column first and drop them in a single call
     # rather than re-dropping the frame once per column.
     sparse_columns = sum_of_cols[sum_of_cols < threshold_value].index.tolist()
     print('del_col:', sparse_columns)
     self.ori_association_matrix.drop(columns=sparse_columns, inplace=True)
Code block 6
# 对行密集处理
def drop_rows_of_ori_associationMatrix(self):
    """Drop, in place, every row whose sum is below the row threshold.

    A row of ``self.ori_association_matrix`` is removed when the sum of its
    entries is strictly less than ``self.dropRows_threshold``. Returns
    nothing; the matrix is modified in place.
    """
    sum_of_rows = self.ori_association_matrix.sum(axis=1)
    print(sum_of_rows.sort_values())
    # The label names the row threshold (it used to say "col").
    print('threshold_value_row:', self.dropRows_threshold)
    # Collect every sparse row first and drop them in a single call rather
    # than re-dropping the frame once per row.
    sparse_rows = sum_of_rows[sum_of_rows < self.dropRows_threshold].index.tolist()
    print('del_row:', sparse_rows)
    self.ori_association_matrix.drop(index=sparse_rows, inplace=True)

4、进行密集处理之后就是我们所需要的关联矩阵,现附上所有处理的代码。

Code block 7
import pandas as pd


class DataPreprocessing:
    """Build a circRNA-disease association matrix and prune its sparse parts.

    The input spreadsheet must contain the columns 'circRNA Name' and
    'Disease Name'. On construction the spreadsheet is loaded and the 0/1
    association matrix is built (and optionally saved). The two ``drop_*``
    methods then remove, in place, the columns / rows whose number of
    associations is below the configured thresholds.

    Attributes set by ``__init__``:
        data_ori_way: path of the input spreadsheet.
        ori_matrix_path: where the initial matrix is written, or None to skip
            writing it.
        dropRows_threshold: rows with a smaller sum are dropped.
        dropColumns_threshold: columns with a smaller sum are dropped.
        data: two-column DataFrame ('circRNA Name', 'Disease Name').
        ori_association_matrix: the association matrix (circRNAs x diseases).
    """

    # Locations used when no explicit paths are passed to the constructor.
    DEFAULT_DATA_PATH = r'F:\pycharmspace\pycharm-project\daily_dome\data_preprocessing_2\inputData\The circRNA-disease entries of Human(2).xlsx'
    DEFAULT_ORI_MATRIX_PATH = r'F:\pycharmspace\pycharm-project\daily_dome\data_preprocessing_2\outputData_2\ori_association_matrix.xlsx'

    def __init__(self, data_path=DEFAULT_DATA_PATH, ori_matrix_path=DEFAULT_ORI_MATRIX_PATH,
                 dropRows_threshold=1, dropColumns_threshold=4):
        """Load the entries and build the initial association matrix.

        Args:
            data_path: spreadsheet with the circRNA-disease entries.
            ori_matrix_path: output path for the initial matrix; pass None to
                skip writing it.
            dropRows_threshold: minimum row sum for a row to be kept.
            dropColumns_threshold: minimum column sum for a column to be kept.
        """
        self.data_ori_way = data_path
        self.ori_matrix_path = ori_matrix_path
        self.dropRows_threshold = dropRows_threshold
        self.dropColumns_threshold = dropColumns_threshold

        self.data = self.load_data(self.data_ori_way)
        self.ori_association_matrix = self.construct_ori_association()

    def load_data(self, path) -> pd.DataFrame:
        """Read the spreadsheet at ``path`` and keep only the two name columns."""
        data_ori = pd.read_excel(path)
        return data_ori[['circRNA Name', 'Disease Name']]

    def getUniqueIndex(self, data):
        """Return the unique circRNA names and unique disease names of ``data``.

        Each list keeps the order of first appearance in its column.

        Returns:
            A ``(circRNA_names, disease_names)`` tuple of two lists.
        """
        circRNAs = data['circRNA Name'].drop_duplicates()
        diseases = data['Disease Name'].drop_duplicates()
        return list(circRNAs), list(diseases)

    def search_related_disease(self, circRNA_target):
        """Return the diseases recorded for one circRNA.

        The first column of ``self.data`` is compared with ``circRNA_target``
        and the matching second-column values are returned in row order
        (repeated rows yield repeated entries).
        """
        # Select both columns by position with iloc. Indexing a string-labelled
        # row Series with the integers 0/1 relies on a positional fallback that
        # pandas has deprecated, and scanning row by row is needlessly slow.
        circRNA_names = self.data.iloc[:, 0]
        disease_names = self.data.iloc[:, 1]
        return disease_names[circRNA_names == circRNA_target].tolist()

    def construct_ori_association(self) -> pd.DataFrame:
        """Build the initial 0/1 association matrix and optionally save it.

        Rows are the unique circRNA names, columns the unique disease names.
        A cell is 1 when at least one entry links that circRNA to that
        disease, and 0 otherwise. The matrix is written to
        ``self.ori_matrix_path`` unless that attribute is None.

        Returns:
            The association matrix as a DataFrame.
        """
        circRNAs, diseases = self.getUniqueIndex(self.data)
        # Create the matrix already filled with integer zeros instead of
        # building an all-NaN object-dtype frame and filling it afterwards.
        ori_association_matrix = pd.DataFrame(0, index=circRNAs, columns=diseases, dtype=int)
        for circRNA in circRNAs:
            for disease in self.search_related_disease(circRNA):
                ori_association_matrix.loc[circRNA, disease] = 1
        if self.ori_matrix_path is not None:
            ori_association_matrix.to_excel(self.ori_matrix_path)
        return ori_association_matrix

    def drop_columns_of_ori_associationMatrix(self):
        """Drop, in place, every column whose sum is below ``dropColumns_threshold``."""
        threshold_value = self.dropColumns_threshold
        print('threshold_value_col:', threshold_value)
        sum_of_cols = self.ori_association_matrix.sum(axis=0)
        # Collect every sparse column first and drop them in a single call
        # rather than re-dropping the frame once per column.
        sparse_columns = sum_of_cols[sum_of_cols < threshold_value].index.tolist()
        print('del_col:', sparse_columns)
        self.ori_association_matrix.drop(columns=sparse_columns, inplace=True)

    def drop_rows_of_ori_associationMatrix(self):
        """Drop, in place, every row whose sum is below ``dropRows_threshold``."""
        sum_of_rows = self.ori_association_matrix.sum(axis=1)
        print(sum_of_rows.sort_values())
        # The label names the row threshold (it used to say "col").
        print('threshold_value_row:', self.dropRows_threshold)
        sparse_rows = sum_of_rows[sum_of_rows < self.dropRows_threshold].index.tolist()
        print('del_row:', sparse_rows)
        self.ori_association_matrix.drop(index=sparse_rows, inplace=True)


if __name__ == '__main__':
    # Build the initial association matrix from the Human-only spreadsheet.
    DP = DataPreprocessing()

    # Drop sparse columns first ...
    DP.drop_columns_of_ori_associationMatrix()
    # ... then drop sparse rows, using row sums taken after the column drop.
    DP.drop_rows_of_ori_associationMatrix()

    association_del_col_row = DP.ori_association_matrix

    # Save the remaining circRNA names (the row labels) in matrix order.
    circRNA_name = association_del_col_row.index.tolist()
    df_c = pd.DataFrame(circRNA_name, columns=['circRNA Name'])
    print(circRNA_name)
    df_c.to_excel(
        r'F:\pycharmspace\pycharm-project\daily_dome\data_preprocessing_2\outputData_2\circRNAName.xlsx', index=False)

    # Save the remaining disease names (the column labels) in matrix order.
    disease_name = association_del_col_row.columns.tolist()
    df_d = pd.DataFrame(disease_name, columns=['disease Name'])
    print(disease_name)
    df_d.to_excel(
        r'F:\pycharmspace\pycharm-project\daily_dome\data_preprocessing_2\outputData_2\diseaseName.xlsx', index=False)

    # Save the bare 0/1 matrix without labels; the two name files written
    # above carry the row and column labels.
    association_del_col_row.to_excel(
        r'F:\pycharmspace\pycharm-project\daily_dome\data_preprocessing_2\outputData_2\associationMatrix.xlsx',
        index=False, header=False)

    # Count the associations (cells equal to 1) in one vectorised pass
    # instead of visiting every cell through iloc.
    count = int((association_del_col_row == 1).to_numpy().sum())

    print(association_del_col_row)
    print('关联数:', count)

产生的关联矩阵结果展示:
在这里插入图片描述

5、 Feedback & Bug Report


完结撒花!

  • 6
    点赞
  • 7
    收藏
    觉得还不错? 一键收藏
  • 3
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值