对 CircR2Disease v2.0 进行数据清洗产生circRNA-disease关联矩阵的方法
circRNA-disease 关联矩阵是使用机器学习和深度学习方法预测与疾病存在潜在关联的 circRNA 时所必需的数据材料。为此,本文基于记录了最新关联数据的数据库 CircR2Disease v2.0,向大家介绍一种使用 pandas 进行数据清洗、生成 circRNA-disease 关联矩阵的方法。具体实现分为如下几步,后文会展开叙述细节。
-
下载所需数据库 - CircR2Disease v2.0,是陕西师范大学,雷秀娟教授,于2021年11月24日在Genomics, Proteomics & Bioinformatics(IF=7.691)刊上发表的记录circRNA与disease关系的数据库,并提供在线查询,文档下载等功能。其文章链接为https://www.sciencedirect.com/science/article/pii/S1672022921002461。其统计的数据库链接为 http://bioinfo.snnu.edu.cn/CircR2Disease_v2.0。在这篇博客中,我们只需要它 “The circRNA-disease entries.xlsx”中的文档数据。如下图所示。
-
筛除冗余 - 从文档中筛去与人类疾病无关的数据,只保留与人类疾病相关的数据。
-
pandas进行数据处理 - 鉴于pandas对excel数据处理优势,即可用circRNA和disease名字作为行列索引,在此使用pandas进行代码编写。
文章目录
1、删除冗余数据
我们可以看到 “The circRNA-disease entries.xlsx” 表格统计中的“Species”一列中既有 human,还有 mouse,rat的数据统计,根据需求,我们就只需要human的数据统计。那我们的一个思路就是只保留human字段所在的行。也可理解为第一步数据清洗。其处理的代码如下:
Code block 1
import pandas as pd
from pandas import DataFrame as df
# Step 1 of the cleaning pipeline: keep only the human entries of the
# CircR2Disease v2.0 spreadsheet and save them to a new workbook.
#
# The input path is a raw string: in a normal literal the backslashes of
# this Windows path would be parsed as (invalid) escape sequences such as
# "\d" and "\T", which newer Python versions warn about.
data = pd.read_excel(r'.\data\The circRNA-disease entries.xlsx')

# Select the rows whose "Species" value is exactly 'Human' with a boolean
# mask. This does not depend on the frame having a default 0..n-1 index,
# unlike collecting index labels and feeding them to iloc.
a = data[data.Species == 'Human']

# Write the filtered rows (with the header row, without the index column).
a.to_excel(
    r'F:\pycharmspace\pycharm-project\daily_dome\data_preprocessing_2\inputData\The circRNA-disease entries of Human(2).xlsx',
    index=False, header=True, engine='xlsxwriter')
处理过后的表格如下:可见只保留了Human的数据统计。
2、 根据 circRNA Name 和 Disease Name 数据计算circRNA-disease关联矩阵
2.1、唯一化 circRNA 名字 和 Disease 名字,用来作为关联矩阵的行,列索引。
Code block 2
# 获取circRNA和disease的唯一索引
def getUniqueIndex(self, data):
    """Return the distinct circRNA names and disease names found in *data*.

    Args:
        data: DataFrame with a 'circRNA Name' and a 'Disease Name' column.

    Returns:
        A ``(circRNAs, diseases)`` tuple of two lists with duplicates
        removed (the first occurrence of each name is kept).
    """
    unique_names = [
        list(data[column].drop_duplicates())
        for column in ('circRNA Name', 'Disease Name')
    ]
    return unique_names[0], unique_names[1]
2.2、获取每个circRNA所关联的疾病,用列表存储。
Code block 3
# 获取与单个circRNA所关联的疾病
def search_related_disease(self, circRNA_target):
    """Return the diseases recorded for one circRNA.

    The first column of ``self.data`` is compared with *circRNA_target* and
    the matching values of the second column are returned.

    Args:
        circRNA_target: circRNA name to look up.

    Returns:
        List of second-column values for every matching row, in row order.
        Repeated entries are kept; the list is empty when nothing matches.
    """
    # Address the two columns by position with iloc. Integer lookups such as
    # row[0] on a label-indexed Series are deprecated in pandas 2.x.
    circRNA_column = self.data.iloc[:, 0]
    disease_column = self.data.iloc[:, 1]
    # One vectorised comparison replaces the row-by-row Python loop.
    return disease_column[circRNA_column == circRNA_target].tolist()
2.3、初步计算circRNA-disease关联矩阵,保存为“ori_association_matrix.xlsx”。
Code block 4
def construct_ori_association(self) -> pd.DataFrame:
    """Build the initial 0/1 circRNA-disease association matrix.

    Rows are the unique circRNA names and columns the unique disease names
    returned by ``self.getUniqueIndex(self.data)``. A cell is set to 1 when
    ``self.search_related_disease`` lists that disease for that circRNA.
    The matrix is also written to ``ori_association_matrix.xlsx``.

    Returns:
        The association matrix as a DataFrame.
    """
    circRNAs, diseases = self.getUniqueIndex(self.data)
    # Create the matrix already filled with integer zeros. An empty
    # (object-dtype) frame followed by fillna(0) relies on pandas silently
    # downcasting object columns, which is deprecated behaviour.
    ori_association_matrix = pd.DataFrame(0, index=circRNAs, columns=diseases)
    for circRNA in circRNAs:
        dis_list = self.search_related_disease(circRNA)
        print(dis_list)
        for disease in dis_list:
            ori_association_matrix.loc[circRNA, disease] = 1
    ori_association_matrix.to_excel(
        r'F:\pycharmspace\pycharm-project\daily_dome\data_preprocessing_2\outputData_2\ori_association_matrix.xlsx')
    return ori_association_matrix
3、对初步产生的circRNA-disease关联矩阵进行数据密集规范化处理。
3.1、设置阈值(dropRows_threshold / dropColumns_threshold),将关联数小于对应阈值的行或者列删除
Code block 5
# 对列进行密集处理
def drop_columns_of_ori_associationMatrix(self):
    """Drop the sparse columns of ``self.ori_association_matrix`` in place.

    A column is removed when its sum is smaller than
    ``self.dropColumns_threshold``. Progress is printed to stdout.
    """
    sum_of_cols = self.ori_association_matrix.sum(axis=0)
    threshold_value = self.dropColumns_threshold
    print('threshold_value_col:', threshold_value)
    sparse_columns = []
    for column in self.ori_association_matrix.columns.tolist():
        print(column)
        if sum_of_cols.loc[column] < threshold_value:
            print('sum_of_cols.loc[column]:', sum_of_cols.loc[column])
            sparse_columns.append(column)
            print('del_col,Ture')
    # Remove all sparse columns with a single drop rather than rebuilding
    # the frame once per removed column.
    self.ori_association_matrix.drop(columns=sparse_columns, inplace=True)
Code block 6
# 对行密集处理
def drop_rows_of_ori_associationMatrix(self):
    """Drop the sparse rows of ``self.ori_association_matrix`` in place.

    A row is removed when its sum is smaller than
    ``self.dropRows_threshold``. Progress is printed to stdout.
    """
    sum_of_rows = self.ori_association_matrix.sum(axis=1)
    # The ascending sort is only for the printed overview.
    print(sum_of_rows.sort_values())
    # Label fixed: this is the row threshold, not the column threshold.
    print('threshold_value_row:', self.dropRows_threshold)
    sparse_rows = []
    for row in self.ori_association_matrix.index.tolist():
        print(row)
        if sum_of_rows.loc[row] < self.dropRows_threshold:
            print('sum_of_rows.loc[row]:', sum_of_rows.loc[row])
            sparse_rows.append(row)
            print('del_row,Ture')
    # Remove all sparse rows with a single drop rather than rebuilding the
    # frame once per removed row.
    self.ori_association_matrix.drop(index=sparse_rows, inplace=True)
4、进行密集处理之后就是我们所需要的关联矩阵,现附上所有处理的代码。
Code block 7
import pandas as pd
class DataPreprocessing:
    """Build a circRNA-disease association matrix and remove sparse rows/columns.

    On construction the spreadsheet at ``data_path`` is loaded, reduced to its
    'circRNA Name' and 'Disease Name' columns, and turned into a 0/1 matrix
    stored in ``ori_association_matrix``. The two ``drop_*`` methods then
    remove columns / rows whose sum is below the configured thresholds.
    """

    # Locations used when the caller does not pass explicit paths.
    DEFAULT_DATA_PATH = r'F:\pycharmspace\pycharm-project\daily_dome\data_preprocessing_2\inputData\The circRNA-disease entries of Human(2).xlsx'
    DEFAULT_ORI_MATRIX_PATH = r'F:\pycharmspace\pycharm-project\daily_dome\data_preprocessing_2\outputData_2\ori_association_matrix.xlsx'

    def __init__(self, data_path=DEFAULT_DATA_PATH,
                 ori_matrix_path=DEFAULT_ORI_MATRIX_PATH,
                 dropRows_threshold=1, dropColumns_threshold=4):
        """Load the data and build the initial association matrix.

        Args:
            data_path: Excel file to read the association entries from.
            ori_matrix_path: Excel file the initial matrix is written to;
                ``None`` skips writing.
            dropRows_threshold: rows with a sum below this value are removed
                by ``drop_rows_of_ori_associationMatrix``.
            dropColumns_threshold: columns with a sum below this value are
                removed by ``drop_columns_of_ori_associationMatrix``.
        """
        self.data_ori_way = data_path
        self.ori_matrix_path = ori_matrix_path
        self.dropRows_threshold = dropRows_threshold
        self.dropColumns_threshold = dropColumns_threshold
        self.data = self.load_data(self.data_ori_way)
        self.ori_association_matrix = self.construct_ori_association()

    def load_data(self, path) -> pd.DataFrame:
        """Read the Excel file at *path* and keep only the two name columns."""
        data_ori = pd.read_excel(path)
        return data_ori[['circRNA Name', 'Disease Name']]

    def getUniqueIndex(self, data):
        """Return ``(circRNAs, diseases)``: the de-duplicated names as lists."""
        circRNAs = data['circRNA Name'].drop_duplicates()
        diseases = data['Disease Name'].drop_duplicates()
        return list(circRNAs), list(diseases)

    def search_related_disease(self, circRNA_target):
        """Return the diseases recorded for one circRNA.

        Args:
            circRNA_target: circRNA name compared with the first column of
                ``self.data``.

        Returns:
            List of second-column values of every matching row, in row
            order. Empty when nothing matches.
        """
        # Address the columns by position with iloc. Integer lookups such as
        # row[0] on a label-indexed Series are deprecated in pandas 2.x.
        circRNA_column = self.data.iloc[:, 0]
        disease_column = self.data.iloc[:, 1]
        return disease_column[circRNA_column == circRNA_target].tolist()

    def construct_ori_association(self) -> pd.DataFrame:
        """Build the initial 0/1 association matrix (circRNAs x diseases).

        A cell is 1 when ``search_related_disease`` lists that disease for
        that circRNA. The matrix is written to ``self.ori_matrix_path``
        unless that attribute is ``None``.

        Returns:
            The association matrix as a DataFrame.
        """
        circRNAs, diseases = self.getUniqueIndex(self.data)
        # Create the matrix already filled with integer zeros. An empty
        # (object-dtype) frame followed by fillna(0) relies on pandas
        # silently downcasting object columns, which is deprecated.
        ori_association_matrix = pd.DataFrame(0, index=circRNAs, columns=diseases)
        for circRNA in circRNAs:
            dis_list = self.search_related_disease(circRNA)
            print(dis_list)
            for disease in dis_list:
                ori_association_matrix.loc[circRNA, disease] = 1
        if self.ori_matrix_path is not None:
            ori_association_matrix.to_excel(self.ori_matrix_path)
        return ori_association_matrix

    def drop_columns_of_ori_associationMatrix(self):
        """Drop, in place, every column whose sum is below ``dropColumns_threshold``."""
        sum_of_cols = self.ori_association_matrix.sum(axis=0)
        threshold_value = self.dropColumns_threshold
        print('threshold_value_col:', threshold_value)
        sparse_columns = []
        for column in self.ori_association_matrix.columns.tolist():
            print(column)
            if sum_of_cols.loc[column] < threshold_value:
                print('sum_of_cols.loc[column]:', sum_of_cols.loc[column])
                sparse_columns.append(column)
                print('del_col,Ture')
        # One drop for all sparse columns instead of one per column.
        self.ori_association_matrix.drop(columns=sparse_columns, inplace=True)

    def drop_rows_of_ori_associationMatrix(self):
        """Drop, in place, every row whose sum is below ``dropRows_threshold``."""
        sum_of_rows = self.ori_association_matrix.sum(axis=1)
        # The ascending sort is only for the printed overview.
        print(sum_of_rows.sort_values())
        # Label fixed: this is the row threshold, not the column threshold.
        print('threshold_value_row:', self.dropRows_threshold)
        sparse_rows = []
        for row in self.ori_association_matrix.index.tolist():
            print(row)
            if sum_of_rows.loc[row] < self.dropRows_threshold:
                print('sum_of_rows.loc[row]:', sum_of_rows.loc[row])
                sparse_rows.append(row)
                print('del_row,Ture')
        # One drop for all sparse rows instead of one per row.
        self.ori_association_matrix.drop(index=sparse_rows, inplace=True)
if __name__ == '__main__':
    # Build the initial matrix, then densify it. Columns are dropped first,
    # so the row sums used afterwards are computed on the reduced matrix.
    DP = DataPreprocessing()
    DP.drop_columns_of_ori_associationMatrix()
    DP.drop_rows_of_ori_associationMatrix()
    association_del_col_row = DP.ori_association_matrix

    # Save the remaining circRNA names (row labels).
    circRNA_name = association_del_col_row.index.tolist()
    df_c = pd.DataFrame(circRNA_name, columns=['circRNA Name'])
    print(circRNA_name)
    df_c.to_excel(
        r'F:\pycharmspace\pycharm-project\daily_dome\data_preprocessing_2\outputData_2\circRNAName.xlsx', index=False)

    # Save the remaining disease names (column labels).
    disease_name = association_del_col_row.columns.tolist()
    df_d = pd.DataFrame(disease_name, columns=['disease Name'])
    print(disease_name)
    df_d.to_excel(
        r'F:\pycharmspace\pycharm-project\daily_dome\data_preprocessing_2\outputData_2\diseaseName.xlsx', index=False)

    # Save the bare matrix: no row labels and no header row.
    association_del_col_row.to_excel(
        r'F:\pycharmspace\pycharm-project\daily_dome\data_preprocessing_2\outputData_2\associationMatrix.xlsx',
        index=False, header=False)

    # Count the associations (cells equal to 1) with one vectorised
    # comparison instead of visiting every cell through iloc.
    count = int((association_del_col_row == 1).to_numpy().sum())
    print(association_del_col_row)
    print('关联数:', count)
产生的关联矩阵结果展示:
5、 Feedback & Bug Report
- WeChat: Yc-820558941
- Email: yangchengyjs@163.com
完结撒花!