miRNAs是一类非编码短序列RNA分子(长度大约20nt)。miRNAs虽然不能够直接编码蛋白质,但是能够通过与靶mRNA的结合影响其表达和翻译,进而影响蛋白质的产量。已有研究表明miRNAs的差异表达与包括癌症在内的多种疾病有着密切的关系。采用机器学习算法,基于miRNAs表达谱数据,可实现多种癌症分类以及癌症早晚期分类。
项目将提供数据集和源码,可能处在审核阶段,可几日后访问本人的资源库自行获取:
pancanMiRs__08_04_16.txt ----------------miRNA表达矩阵(维度743*10824),共有10824个样本,每个样本有743个miRNAs特征。
TCGA_phenotype.tsv --------------------------样本表型数据(维度12804*4)。样本条形码末两位的类型编码如果是01-09之间的数,表示该样本数据来源于癌症组织;如果编码是大于等于10的数,表示该样本来源于癌旁正常组织。
分别读取pancanMiRs__08_04_16.txt与TCGA_phenotype.tsv文件中的数据,并分别存放在df_miRNA与df_phenotype中。同时筛选发病组织名称与数量并输出为新的tsv表格
仅保留同时存在df_miRNA与df_phenotype中的样本,并将修改后的结果重新返回df_miRNA与df_phenotype中,输出对应组织样本的标签
将高维数据通过降维技术降到三维,并在三维坐标平面上展示样本的分布情况。
将癌症样本数量较少(_num_cancer<200)的癌症数据样本删除(癌症样本及其癌旁正常组织样本)。
最后选择偏好的算法可以轻松实现预测,下面提供几个不同版本的降维和算法实现路径:
import pandas as pd #可以使用pandas模块来存储多行多列数据
import numpy as np
from sklearn.manifold import TSNE
from sklearn.datasets import load_iris,load_digits
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import os
from sklearn.manifold import TSNE
from sklearn.datasets import load_iris,load_digits
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import os
# Load the miRNA expression matrix: rows = miRNAs, columns = samples
# (743 x 10824 per the description above).
aa = pd.read_table('D:/Users/Downloads/pancanMiRs__08_04_16.txt', index_col=0, header=0, sep='\t') #read the data into DataFrame aa
print(aa)
# Phenotype table: one row per sample, indexed by sample barcode.
# NOTE(review): reads a .txt with encoding='utf-16' although the text above
# names TCGA_phenotype.tsv — confirm the actual file name and encoding.
bb = pd.read_table('D:/Users/Desktop/TCGA_phenotype.txt',index_col=0,encoding='utf-16')
print(bb)
yangming=aa.columns #sample IDs of the expression matrix (column labels)
yangming1=bb.index #sample IDs of the phenotype table (row labels)
#tong=[None]*bb.shape[0] #shape[0] is the row count
tong=[None]*aa.shape[1]  # per-sample disease name, filled by the loop below
tongji=[None]*aa.shape[1]  # per-sample label, 'Normal ' prefixed for normal tissue
# Label every expression-matrix sample with its primary disease.
# Character 13 of a TCGA barcode is the first digit of the two-digit
# sample-type code: 01-09 = tumour tissue, >=10 = adjacent normal tissue.
# A dict built once replaces the original O(samples x phenotype-rows)
# nested scan and its hard-coded 10824/12804 bounds; like the original
# (which never broke out of the inner loop), the LAST phenotype row for a
# duplicated sample ID wins.
disease_by_sample = dict(zip(yangming1, bb.iloc[:, 2]))
for i in range(aa.shape[1]):
    sample_id = yangming[i]
    cn = disease_by_sample.get(sample_id)
    if cn is None:
        continue  # sample absent from the phenotype table -> entry stays None
    tong[i] = cn  # bare disease name for tumour and normal alike
    if int(sample_id[13]) > 0:
        tongji[i] = 'Normal ' + cn  # adjacent normal tissue
    else:
        tongji[i] = cn  # tumour tissue
# Tally samples per disease. `tong` carries the bare disease name for every
# matched sample, so its value_counts gives totals; `tongji` splits tumour
# entries ('X') from normal entries ('Normal X').
tong_ge=pd.Series(tong).value_counts() #total samples per disease
tong_geshu=tong_ge.to_frame(name='_num_cancer') #first column name
tong_geshu.loc[:," _num_normal"]=0 #second column, filled below
tongji_geshu=pd.Series(tongji).value_counts() #tumour vs normal counts
print(len(tongji_geshu.index))
zhonglei1=tong_geshu.index #disease names
# Fill both columns by LABEL. The original positional loop stopped one row
# short (range(len-1)) and paired row i of tong_geshu with row i of
# tongji_geshu, two indexes in different orders, misattributing counts.
for disease in zhonglei1:
    tong_geshu.loc[disease, '_num_cancer'] = tongji_geshu.get(disease, 0)
    tong_geshu.loc[disease, " _num_normal"] = tongji_geshu.get('Normal ' + disease, 0)
print(tong_geshu)
# First visualisation: all samples, before the low-count filtering below.
X_tsne=TSNE(n_components=2,perplexity=15,random_state=33).fit_transform(aa.T) #t-SNE to 2D; rows of aa.T are samples
X_pca=PCA(n_components=2).fit_transform(aa.T) #PCA to 2D
ckpt_dir="images"
if not os.path.exists(ckpt_dir): #create a directory for saving figures
    os.makedirs(ckpt_dir)
plt.figure(figsize=(10,5)) #10x5-inch figure window
plt.subplot(121)
plt.scatter(X_tsne[:,0],X_tsne[:,1])
plt.title('t-SNE')
plt.xlabel('t-SNE Dimension1')
plt.ylabel('t-SNE Dimension2')
plt.subplot(122)
plt.scatter(X_pca[:,0],X_pca[:,1])
plt.title('PCA')
plt.xlabel('PCA Dimension1')
plt.ylabel('PCA Dimension2')
# NOTE(review): the post-filter plot further down saves to the SAME path,
# overwriting this figure — use distinct file names if both are wanted.
plt.savefig('images/pca_vs_tsne.png',dpi=120) #save under the images directory
print(bb)
print(bb.iloc[:,1])
plt.show() #display the figure
# Drop diseases with fewer than 200 tumour samples. Phenotype rows carry the
# bare disease name for tumour AND adjacent-normal samples alike, so both
# kinds are removed together, as the task requires.
shan=tong_geshu[tong_geshu.iloc[:,0]<200]
# Fix: the original passed the DataFrame `shan` itself to isin(), which
# iterates its COLUMN labels ('_num_cancer', ' _num_normal'), never matching
# a disease; membership must be tested against the disease-name index.
drop_sample_index_list = bb.index[bb['_primary_disease'].isin(shan.index)].tolist()
# errors='ignore': the phenotype table lists samples absent from the matrix,
# and drop() would otherwise raise KeyError on them.
aa = aa.drop(columns=drop_sample_index_list, errors='ignore') #remove low-count samples
tong_geshu=tong_geshu[tong_geshu.iloc[:,0]>=200]
# Second visualisation: same t-SNE/PCA views after dropping rare cancers.
X_tsne=TSNE(n_components=2,perplexity=15,random_state=33).fit_transform(aa.T) #t-SNE on the filtered matrix
X_pca=PCA(n_components=2).fit_transform(aa.T) #PCA on the filtered matrix
ckpt_dir="images"
if not os.path.exists(ckpt_dir): #create a directory for saving figures
    os.makedirs(ckpt_dir)
plt.figure(figsize=(10,5)) #10x5-inch figure window
plt.subplot(121)
plt.scatter(X_tsne[:,0],X_tsne[:,1])
plt.title('t-SNE')
plt.xlabel('t-SNE Dimension1')
plt.ylabel('t-SNE Dimension2')
plt.subplot(122)
plt.scatter(X_pca[:,0],X_pca[:,1])
plt.title('PCA')
plt.xlabel('PCA Dimension1')
plt.ylabel('PCA Dimension2')
# NOTE(review): same file name as the pre-filter figure — this overwrites it.
plt.savefig('images/pca_vs_tsne.png',dpi=120) #save under the images directory
print(bb)
print(bb.iloc[:,1])
plt.show() #display the figure
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
#-- Train/test split ----------------------------------------------------
x=aa.T #miRNA expression: rows = samples, columns = features
# Fix: the original took y = bb.iloc[:, 1] over ALL 12804 phenotype rows
# while x only holds the samples kept after filtering, so train_test_split
# failed on mismatched lengths. Align labels to the retained samples.
# NOTE(review): assumes bb's sample-barcode index is unique — confirm.
common_samples = x.index.intersection(bb.index)
x = x.loc[common_samples]
y = bb.loc[common_samples].iloc[:, 1] #numeric sample label (2nd data column)
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=0)
#-- Model selection and training ----------------------------------------
model_RFC=RandomForestClassifier()
model_RFC.fit(x_train,y_train)
#-- Prediction ----------------------------------------------------------
y_pred_RFC=model_RFC.predict(x_test)
#Put predictions next to the ground truth for easy inspection
out_pred=pd.DataFrame()
out_pred['y_test']=y_test
out_pred['y_pred']=y_pred_RFC
#-- Results: confusion matrix -------------------------------------------
result=confusion_matrix(y_test, y_pred_RFC)
print("Confusion Matrix")
print(result)
#-- Results: f1 score, precision, accuracy, etc. ------------------------
result1=classification_report(y_test,y_pred_RFC)
print('classification report')
print(result1)
下面提供上述图中带多个降维的代码:
import pandas as pd

pancan_mirs_df = pd.read_csv('pancanMiRs__08_04_16.tsv', sep='\t', index_col=0)
phenotype_df = pd.read_csv('TCGA_phenotype.tsv', sep='\t')

# Build a sample -> disease lookup ONCE. drop_duplicates keeps the first
# occurrence, matching the original per-sample scan that took .iloc[0], and
# removes the accidental O(samples x phenotype-rows) DataFrame filter that
# ran inside the loop.
disease_lookup = (
    phenotype_df.drop_duplicates(subset='sample')
    .set_index('sample')['_primary_disease']
)

sample_disease_map = {}
for sample in pancan_mirs_df.columns:
    if sample not in disease_lookup.index:
        continue  # sample missing from the phenotype table
    disease = disease_lookup[sample]
    # The last two digits of a TCGA barcode are the sample-type code:
    # 01-09 = tumour tissue, >=10 = adjacent normal tissue.
    if int(sample[-2:]) < 10:
        sample_disease_map[sample] = disease
    else:
        sample_disease_map[sample] = f"Normal {disease}"

result_df = pd.DataFrame.from_dict(sample_disease_map, orient='index', columns=['Disease'])
result_df.to_csv('sample_disease_mapping.tsv', sep='\t')
import pandas as pd

# Count tumour vs adjacent-normal samples per disease from the mapping file.
# Fixes vs the original: normals were keyed under the full 'Normal X' name
# while tumours were keyed under a name with its LAST word chopped off
# (split(' ')[:-1]), and normal counts were then fabricated as equal to the
# tumour counts under yet another key spelling. Here both kinds accumulate
# under the one bare disease name.
df = pd.read_csv('sample_disease_mapping.tsv', sep='\t')

disease_counts = {}
for disease in df['Disease']:
    # Mapping entries are either '<disease>' or 'Normal <disease>'
    if disease.lower().startswith('normal '):
        base_name = disease[len('Normal '):]
        kind = 'normal'
    else:
        base_name = disease
        kind = 'disease'
    counts = disease_counts.setdefault(base_name, {'normal': 0, 'disease': 0})
    counts[kind] += 1

# One row per disease, columns in a fixed order
result_df = pd.DataFrame(disease_counts).T
result_df = result_df[['normal', 'disease']]
# Write the tally to a new TSV file
result_df.to_csv('disease_counts.tsv', sep='\t', index=True, header=True)
import pandas as pd

# Read the per-sample disease labels produced by the mapping step above.
df = pd.read_csv('sample_disease_mapping.tsv', sep='\t', usecols=['Disease'])

# Fix: the mapping writes 'Normal <disease>' with a CAPITAL N, but the
# original tested `'normal' in disease_name` case-sensitively, so every
# normal sample was miscounted as a tumour sample. Compare case-insensitively
# and strip the prefix by length.
disease_stats = {}
for disease_name in df['Disease']:
    if disease_name.lower().startswith('normal '):
        # Adjacent-normal sample: count under the bare disease name
        base_disease_name = disease_name[len('Normal '):]
        entry = disease_stats.setdefault(base_disease_name, {'normal': 0, 'disease': 0})
        entry['normal'] += 1
    else:
        # Tumour sample
        entry = disease_stats.setdefault(disease_name, {'normal': 0, 'disease': 0})
        entry['disease'] += 1

# One row per disease, columns in a fixed order
stats_df = pd.DataFrame(disease_stats).T
stats_df = stats_df[['normal', 'disease']]
# Write the statistics to a new TSV file
stats_df.to_csv('disease_statistics.tsv', sep='\t')
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import hashlib
# Load the raw matrix WITHOUT header parsing: row 0 holds the sample IDs,
# column 0 the miRNA IDs, and the rest is the numeric expression data.
data = pd.read_csv('pancanMiRs__08_04_16.tsv', sep='\t', header=None)
# Separate names from values
sample_ids = data.iloc[0, 1:].values # sample names from the first row
features = data.iloc[1:, 1:].values.T # expression values, transposed to samples x features
features = features.astype(float) # ensure a float dtype before PCA
def get_color_from_name(name):
    """Deterministically derive an RGBA colour from a sample name.

    The first six hex digits of the name's MD5 digest are read as an
    RGB triple, each channel normalised to [0, 1], with full opacity.
    """
    digest = hashlib.md5(name.encode()).hexdigest()
    red, green, blue = (int(digest[pos:pos + 2], 16) / 255 for pos in (0, 2, 4))
    return (red, green, blue, 1)
# One colour per sample, derived from the sample-name hash
colors = [get_color_from_name(name) for name in sample_ids]
# Reduce the expression matrix to three principal components
pca = PCA(n_components=3)
X_pca_3d = pca.fit_transform(features)
# Plot the 3-D embedding
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
# 3-D scatter, coloured by the per-sample hash colour
sc = ax.scatter(X_pca_3d[:, 0], X_pca_3d[:, 1], X_pca_3d[:, 2], s=5, c=colors)
# Axis labels and title
ax.set_xlabel('PCA Feature 1')
ax.set_ylabel('PCA Feature 2')
ax.set_zlabel('PCA Feature 3')
ax.set_title('3D PCA Visualization of pancanMiRs data with Color-coded Samples')
# Show the figure
plt.show()
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
# Load the TSV; the first column becomes the row index
data = pd.read_csv('pancanMiRs__08_04_16.tsv', sep='\t', index_col=0) #index = row IDs
# NOTE(review): this uses the LAST column as the regression target and the
# remaining columns as features — but in this matrix every column is a
# sample, so the "target" is one sample's expression. Confirm this is the
# intended setup before trusting the result.
X = data.iloc[:, :-1] # feature columns (all but last)
y = data.iloc[:, -1] # target column (last)
# 80/20 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# A 100-tree random-forest regressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
# Fit on the training rows
model.fit(X_train, y_train)
# Predict on the held-out rows
y_pred = model.predict(X_test)
# Report the mean squared error of the predictions
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")