TCGA数据挖掘(全网最详细)


前言

本文主要用于介绍TCGA初始数据的处理、数据融合、基因ID转换以及数据的可视化!


一、数据处理

# Keep character columns as characters (do not auto-convert to factors).
options(stringsAsFactors = FALSE)
setwd("G:\\R\\TCGA")
dir.create("SampleFiles")
filepath <- dir(path = "./gdc_DL_20220311_105839.325315", full.names = TRUE)
# Copy every gzipped counts file from the per-sample GDC folders into SampleFiles.
for (wd in filepath) {
  files <- dir(path = wd, pattern = "gz$")          # only the .gz files
  fromfilepath <- file.path(wd, files)              # portable path construction
  tofilepath <- file.path(".", "SampleFiles", files)
  file.copy(fromfilepath, tofilepath)
}

# Decompress all .gz files; gunzip() removes the original archive by default.
setwd("./SampleFiles")
countfields <- dir(path = ".", pattern = "gz$")  # every gzipped file here
library(R.utils)  # provides gunzip()
# Plain loop: we only want the side effect, so sapply()'s collected (and
# auto-printed) return value was noise.
for (gz_file in countfields) {
  gunzip(gz_file)
}

# Parse the GDC metadata JSON and build a file-name -> TCGA-barcode lookup table.
library(rjson)
# ".." refers to the parent directory (the project root).
metadata_json_File <- fromJSON(file = "../metadata.cart.2022-03-11.json")
# Extract the two character vectors first, then build the data frame once;
# this replaces the original rbind-inside-a-loop (quadratic) construction.
file_names <- vapply(metadata_json_File,
                     function(entry) entry[["file_name"]],
                     character(1))
barcodes <- vapply(metadata_json_File, function(entry) {
  entry[["associated_entities"]][[1]][["entity_submitter_id"]]
}, character(1))
# Column names match what the loop version produced (filesName, TCGA_Barcode).
json_File_Info3 <- data.frame(filesName = file_names, TCGA_Barcode = barcodes)

# Use the file name as the row name so barcodes can be looked up by file later.
rownames(json_File_Info3) <- json_File_Info3[, 1]
# Back up the mapping locally; the CSV can be opened directly in Excel.
write.csv(json_File_Info3, file = "../json_File_Info.csv")

二、数据融合

代码如下(示例):

# Drop the filesName column; what remains maps file name (row name) -> barcode.
filesName_To_TCGA_BarcodeFile <- json_File_Info3[-1]
countsFileNames <- dir(pattern = "counts$")  # all per-sample counts files

# Read each counts file into a one-column data frame named by its TCGA
# barcode, then bind all columns at once. This replaces the original
# cbind-in-a-loop, which copies the growing matrix on every iteration.
sample_counts_list <- lapply(countsFileNames, function(txtFile) {
  SampleCounts <- read.table(txtFile, header = FALSE)
  rownames(SampleCounts) <- SampleCounts[, 1]
  SampleCounts <- SampleCounts[-1]
  # Metadata rows are keyed by the original .gz file name.
  colnames(SampleCounts) <- filesName_To_TCGA_BarcodeFile[paste0(txtFile, ".gz"), ]
  SampleCounts
})
# NOTE(review): like the original loop, this assumes every file lists genes
# in the same order — cbind() does not align rows by name; confirm upstream.
allSampleRawcounts <- do.call(cbind, sample_counts_list)

write.csv(allSampleRawcounts, file = "../allSampleRawcounts.csv")  # backup copy
# Strip the Ensembl version suffix (keep the first 15 characters of the ID).
ensembl_id <- substr(rownames(allSampleRawcounts), 1, 15)
rownames(allSampleRawcounts) <- ensembl_id
# RawCounts.csv differs from allSampleRawcounts.csv only in that the Ensembl
# row names have the version number removed.
write.csv(allSampleRawcounts, file = "../RawCounts.csv")

三、基因ID转换

代码如下(示例):

# Add an Ensembl_Id column in front of the counts so the table can be merged
# with the GTF-derived annotation built below.
RawCounts <- allSampleRawcounts 
Ensembl_Id <- data.frame(Ensembl_Id = row.names(RawCounts))
rownames(Ensembl_Id) <- Ensembl_Id[,1]
RawCounts <- cbind(Ensembl_Id,RawCounts)

# library(R.utils)
# unzip("gencode.v39lift37.annotation.gtf.gz") 
library(stringr)
library(readr)
library(dplyr)  # must be attached before select() is used below
gtf_in <- read_delim("../gencode.v39lift37.annotation.gtf/gencode.v39lift37.annotation.gtf",
                     "\t", escape_double = FALSE, col_names = FALSE,
                     comment = "#", trim_ws = TRUE)

gtf_in <- gtf_in[gtf_in$X3=="gene",]  # keep only rows whose feature is "gene"
# Column 9 is the attribute string; split off the first few key/value pairs.
gene_info <- data.frame(str_split_fixed(unique(gtf_in$X9),";",5))
# Take the text inside the quotes of each attribute, giving an annotation
# table of gene_id / gene_name / gene_biotype.
gene_info2 <- unique(data.frame("gene_id"=str_split_fixed(gene_info$X1,'"',3)[,2],
                                "gene_name"=str_split_fixed(gene_info$X3,'"',3)[,2],
                                "gene_biotype"=str_split_fixed(gene_info$X4,'"',3)[,2]
))

# Some GTF rows carry an extra attribute, shifting gene_name one field right;
# for those rows the value parsed into "gene_biotype" is the real gene name.
# NOTE(review): assumes this shift is the only reason gene_biotype is
# non-empty here — confirm against the annotation file.
unempty_str_indices <- which(!gene_info2$gene_biotype == "")
gene_info2$gene_name[unempty_str_indices] <- gene_info2$gene_biotype[unempty_str_indices]
gene_info2 <- select(gene_info2,-gene_biotype)

# Strip the Ensembl version suffix so IDs match the counts matrix row names.
new_gene <- substr(gene_info2[,1],1,15)
gene_info2$gene_id <- new_gene
# Keep one row per gene id. The original called unique(x, by = "gene_id"),
# which is data.table syntax; base unique.data.frame silently ignores `by`.
Ensembl_Id_TO_Genename <- gene_info2[!duplicated(gene_info2$gene_id), ]
colnames(Ensembl_Id_TO_Genename) <- c("Ensembl_Id","gene_id")
write.csv(Ensembl_Id_TO_Genename,file = "../Ensembl_Id_TO_Genename.csv")

# Attach gene symbols to the raw counts and collapse to one row per symbol.
mergeRawCounts <- merge(Ensembl_Id_TO_Genename, RawCounts, by = "Ensembl_Id")
# Sort by gene symbol so duplicated() marks everything after the first row
# of each gene.
mergeRawCounts <- mergeRawCounts[order(mergeRawCounts$gene_id), ]
# duplicated() flags second and later occurrences; keep the unflagged rows.
is_dup <- duplicated(mergeRawCounts$gene_id)
mergeRawCounts <- mergeRawCounts[!is_dup, ]
# Gene symbols become the row names of the expression matrix.
rownames(mergeRawCounts) <- mergeRawCounts$gene_id
# Drop the two annotation columns, leaving only the expression values.
LUAD_Counts_expMatrix <- mergeRawCounts[, -(1:2)]
# Persist the final counts matrix.
write.csv(LUAD_Counts_expMatrix, file = "../LUAD_Counts_expMatrix.csv")

四、表达差异分析

library(TCGAbiolinks)
# The GDCquery() call that produced `query` is kept for reference; the saved
# result is loaded from disk instead of re-querying GDC.
# query <- GDCquery(project = 'TCGA-LUAD',
#                   data.category = 'Transcriptome Profiling',
#                   data.type = 'Gene Expression Quantification',
#                   workflow.type = 'STAR - Counts')
# save(query, file = "query.RData")
load('../query.RData')
# All 600 sample barcodes from the query result. (A dead assignment of
# query[[1]][[1]] that was immediately overwritten has been removed.)
samplesDown <- getResults(query,cols = c("cases"))
# Barcodes of the 533 primary tumour (TP) samples that were actually downloaded.
dataSmTP <- TCGAquery_SampleTypes(barcode = samplesDown,typesample = "TP")
dataSmTP<-dataSmTP[dataSmTP%in%json_File_Info3[,2]]
# Barcodes of the 59 normal tissue (NT) samples that were actually downloaded.
dataSmNT <- TCGAquery_SampleTypes(barcode = samplesDown,typesample = "NT")
dataSmNT<-dataSmNT[dataSmNT%in%json_File_Info3[,2]]

# Re-order the matrix: normal samples first, tumour samples after, so the
# first length(dataSmNT) columns are normals.
Counts <- data.frame(c(LUAD_Counts_expMatrix[,dataSmNT],LUAD_Counts_expMatrix[,dataSmTP]))
rownames(Counts)<-row.names(LUAD_Counts_expMatrix)
colnames(Counts) <-c(dataSmNT,dataSmTP)
write.csv(Counts,'Counts.csv')

 # 输出数据
# 在edger中,1代表contro1样本,2代表case样本。 
# 原始数据中有594个样本,对照有59个和实验组各533个。所以我们可以创建一个分组向量。
# 方法一 edger 
# 包的安装
# options(download.file.method="libcurl")
# BiocManager::install("edgeR")
# install.packages("limma")
library("edgeR")
#创建分组
group <- c(rep(1,59),rep(2,533))

#创建DGEList类型变量
# 这一步相当于创建一列表。注意group中的顺序和counts中行名要对应,
#也就是对照组和实验组要指定正确。这里前59列为contro1,后533列为case。
y<-DGEList(counts=Counts,group=group)

#数据过滤(去除表达量低的数据)
keep <- filterByExpr(y)
y<-y[keep,,keep.lib.sizes=FALSE]

# 计算标准化因子
y<-calcNormFactors(y)

#计算离散度
y<-estimateDisp(y)

#显著性检验
et <-exactTest(y)

# 获取排名靠前的基因,这里设置n=100000是为了输出所有基因
et <- topTags(et,n=100000)

#转换为数据框类型
et <- as.data.frame(et)

# 将行名粘贴为数据框的第一列 
et<- cbind(rownames(et),et)

# 指定列名
colnames(et)<-c("gene_id","log2FoldChange","log2CPM","PValue","FDR")

# 保存数据到本地
write.table(et,"..\\all_LUAD_DEG.xls",sep ="\t",col.names=TRUE, 
            row.names = FALSE, quote =FALSE, na="")
write.csv(et,file = "../all_LUAD_DEG.csv")

# Select significant DEGs: p < 0.05 and |log2FC| > 1.
etSig <- et[which(et$PValue < 0.05 & abs(et$log2FoldChange) > 1), ]

# Add an up_or_down column recording the direction of change.
etSig[which(etSig$log2FoldChange > 0), "up_or_down"] <- "Up"
etSig[which(etSig$log2FoldChange < 0), "up_or_down"] <- "Down"

# (Removed: a dead `new_etSig` copy that re-applied the "Up" label through
# the misspelled column `log2Foldchange` — it selected no rows and the
# variable was never used again.)

# Save the significant DEG table.
write.table(etSig, "..\\LUAD_DEG.xls", sep = "\t", col.names = TRUE,
            row.names = FALSE, quote = FALSE, na = "")
write.csv(etSig, file = "../etSig.csv")

save.image('../TCGA.RData')
# Load from the same path the image was saved to; the original loaded
# 'TCGA.RData' from the current directory, a different file.
load('../TCGA.RData')

五、可视化

1. 筛选上下调及不显著变化的基因

all_LUAD_DEG <- read.csv('all_LUAD_DEG.csv', header = TRUE, row.names = 1)
FC <- 1.5      # fold-change cutoff; proteomics projects commonly use 1.5
PValue <- 0.05 # p-value cutoff
# Load packages (install.packages("pkg") first if missing).
library(ggplot2)
library(ggrepel)  # for non-overlapping point labels
library(dplyr)
# Classify every gene. The original compared the raw p-value against
# -1 * log2(0.05) (about 4.32), which is TRUE for every p in [0, 1]; the
# intended test is against the 0.05 cutoff itself. Genes that fail the
# p-value cutoff default to "NotSig" instead of being left NA.
all_LUAD_DEG$up_or_down <- "NotSig"
all_LUAD_DEG[which(all_LUAD_DEG$log2FoldChange >= log2(FC) &
                     all_LUAD_DEG$PValue < PValue), "up_or_down"] <- "Up"
all_LUAD_DEG[which(all_LUAD_DEG$log2FoldChange <= -log2(FC) &
                     all_LUAD_DEG$PValue < PValue), "up_or_down"] <- "Down"
write.csv(all_LUAD_DEG, './TCGA_LUAD.csv')

2.挑选top 10

# Pick the 10 most significant genes in each direction (ranked by FDR, then
# by absolute fold change) for labelling on the volcano plot.
top <- 10
top_genes <- bind_rows(lapply(c("Up", "Down"), function(direction) {
  all_LUAD_DEG %>%
    filter(up_or_down == direction) %>%
    arrange(FDR, desc(abs(log2FoldChange))) %>%
    head(top)
}))

3.火山图

# Volcano plot. Columns are mapped by bare name inside aes(); the original
# used all_LUAD_DEG$col, which ggplot2 discourages.
p1 <- ggplot(data = all_LUAD_DEG,
             aes(x = log2FoldChange,
                 y = -log10(PValue))) +
  geom_point(alpha = 0.4, size = 1, aes(color = up_or_down)) +
  scale_color_manual(values = c("blue", "grey", "red")) +
  geom_hline(yintercept = -log10(PValue), linetype = 2) +
  geom_vline(xintercept = c(-log2(FC), log2(FC)), linetype = 2) +
  theme_bw()

# Label only the pre-selected top up/down genes. The original passed the
# whole table (labelling every gene) and contained a stray "R" token that
# made the call a syntax error.
p2 <- p1 +
  geom_label_repel(data = top_genes,
                   mapping = aes(log2FoldChange, -log(FDR, 10), label = gene_id),
                   size = 2)
p2
ggsave('TCGA_LUAD.png', p2, width = 14, height = 8)

请添加图片描述

4. 热图

4.1 上调前50

library(pheatmap)
# Rank genes by fold change (largest increases first) and keep the
# up-regulated ones; their symbols select rows of the counts matrix.
# (The original first dropped the last and first columns of all_LUAD_DEG,
# which deleted the up_or_down column the filter below needs.)
all_LUAD_DEG <- all_LUAD_DEG[order(all_LUAD_DEG$log2FoldChange, decreasing = TRUE), ]
deg_opt <- all_LUAD_DEG %>% filter(all_LUAD_DEG$up_or_down == "Up")
up_gene_names <- rownames(deg_opt)
# read.csv(row.names = 1) already sets gene symbols as row names. The
# original then reset them from a non-existent V1 column (a leftover from a
# fread() variant), which destroyed the row names and broke the lookup.
Counts <- read.csv('Counts.csv', row.names = 1)
result_heatmap <- Counts[up_gene_names, ]

p3 <- pheatmap(result_heatmap[1:50, ],
               # annotation_row = dfGene,   # optional row grouping
               # annotation_col = dfSample, # optional column grouping
               show_colnames = FALSE,  # hide column names
               show_rownames = FALSE,  # hide row names
               fontsize = 10,
               color = colorRampPalette(c('#0000ff', '#ffffff', '#ff0000'))(50),
               annotation_legend = TRUE,
               border_color = NA,      # NA = no cell borders
               # NOTE(review): "column" scales per sample; per-gene ("row")
               # scaling is more usual for expression heatmaps — confirm.
               scale = "column",
               cluster_rows = TRUE,
               cluster_cols = TRUE
)
p3

请添加图片描述

4.2 下调50

library(pheatmap)
# For the down-regulated set, sort ascending so the most strongly
# down-regulated genes come first; the original sorted descending and then
# took the first 50, i.e. the 50 LEAST down-regulated genes.
all_LUAD_DEG <- all_LUAD_DEG[order(all_LUAD_DEG$log2FoldChange, decreasing = FALSE), ]
deg_opt <- all_LUAD_DEG %>% filter(all_LUAD_DEG$up_or_down == "Down")
down_gene_names <- rownames(deg_opt)
# read.csv(row.names = 1) sets gene symbols as row names for the lookup.
Counts <- read.csv('Counts.csv', row.names = 1)
result_heatmap <- Counts[down_gene_names, ]

p4 <- pheatmap(result_heatmap[1:50, ],
               # annotation_row = dfGene,   # optional row grouping
               # annotation_col = dfSample, # optional column grouping
               show_colnames = FALSE,  # hide column names
               show_rownames = TRUE,   # show row names
               fontsize = 5,
               color = colorRampPalette(c('#0000ff', '#ffffff', '#ff0000'))(50),
               annotation_legend = TRUE,
               border_color = NA,      # NA = no cell borders
               scale = "column",       # see NOTE in the up-regulated section
               cluster_rows = TRUE,
               cluster_cols = TRUE
)
p4
# Save p4 (the down-regulated heatmap); the original saved p3 again and
# misspelled the file name ("Dowm").
ggsave('pheatmap_Down.png', p4, width = 14, height = 8)
save.image('TCGA.RData')

请添加图片描述


总结

原始数据比较大,就不放在这里了,有需要的朋友私我领取!!!

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

莘薪

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值