TCGA 数据下载 —— TCGAbiolinks 数据分析
文章目录
前言
前面,我们介绍了如何获取 TCGA
的各种数据。在获取到数据之后,我们就可以进行数据分析及分析结果的可视化了
TCGAbiolinks
也提供了一些列的函数,通过封装一些常用的算法来简化分析的流程。例如差异基因、富集分析、生存分析等
先导入依赖包
library(TCGAbiolinks)
library(SummarizedExperiment)
library(tidyverse)
数据分析
表达谱处理
我们可以很容易地获取到基因表达矩阵
# 定义需要分析的样本编号(barcodes)
listSamples <- c(
"TCGA-E9-A1NG-11A-52R-A14M-07","TCGA-BH-A1FC-11A-32R-A13Q-07",
"TCGA-A7-A13G-11A-51R-A13Q-07","TCGA-BH-A0DK-11A-13R-A089-07",
"TCGA-E9-A1RH-11A-34R-A169-07","TCGA-BH-A0AU-01A-11R-A12P-07",
"TCGA-C8-A1HJ-01A-11R-A13Q-07","TCGA-A7-A13D-01A-13R-A12P-07",
"TCGA-A2-A0CV-01A-31R-A115-07","TCGA-AQ-A0Y5-01A-11R-A14M-07"
)
# 查询 Illumina HiSeq 平台的数据
query <- GDCquery(
project = "TCGA-BRCA",
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
barcode = listSamples
)
# 下载 IlluminaHiSeq_RNASeqV2 数据
GDCdownload(query)
# 将表达谱处理成:行为 geneID 列为 samples (barcode)
BRCA.Rnaseq.SE <- GDCprepare(query)
# count 数,不区分链方向,即比对上的两条链上数据之和
BRCAMatrix <- assay(BRCA.Rnaseq.SE, "unstranded")
绘制样本相关性热图和箱线图
BRCA.RNAseq_CorOutliers <- TCGAanalyze_Preprocessing(BRCA.Rnaseq.SE, filename = "~/Downloads/demo.jpg")
对数据进行标准化及过滤
# 需要使用到 EDASeq 包
library(EDASeq)
# 对基因进行标准化
dataNorm <- TCGAanalyze_Normalization(
tabDF = BRCA.RNAseq_CorOutliers,
geneInfo = geneInfoHT
)
# 过滤低表达的基因
dataFilt <- TCGAanalyze_Filtering(
tabDF = dataNorm,
method = "quantile",
qnt.cut = 0.25
)
绘制热图
绘制差异基因热图
TCGAvisualize_Heatmap(
data = dataFilt,
col.metadata = colData(BRCA.Rnaseq.SE)[,
c("barcode", "ajcc_pathologic_stage", "tissue_type")],
col.colors = list(
tissue_type = c(
"Tumor" = "red",
"Normal" = "green3")
),
sortCol = "tissue_type",
type = "expression", # sets default color
scale = "row", # use z-scores for better visualization. Center gene expression level around 0.
title = "Heatmap from concensus cluster",
filename = "~/Downloads/Heatmap.png",
extremes = seq(-2, 2, 1),
color.levels = colorRampPalette(c("green", "black", "red"))(n = 5),
cluster_rows = TRUE,
cluster_columns = FALSE,
width = 1000,
height = 500
)
差异表达分析
TCGAanalyze_DEA
函数用于执行差异表达分析(DEA
)来识别差异表达基因(DEG
)
差异表达基因识别算法是基于 limma
和 edgeR
两个包的,默认使用的 edgeR
,可以设置 pipeline = "limma"
来使用 limma
主要参数有:
mat1
:第一分组样本的表达数据mat2
:第二分组样本的表达数据Cond1type
:第一分组的标签Cond2type
:第二分组的标签fdr.cut
:FDR
阈值,默认为1
logFC.cut
:logFC
阈值,默认为0
例如,使用上一步生成的表达谱,识别差异表达基因
# 选择正常样本("NT")
samplesNT <- TCGAquery_SampleTypes(
barcode = colnames(dataFilt),
typesample = c("NT")
)
# 选择肿瘤样本("TP")
samplesTP <- TCGAquery_SampleTypes(
barcode = colnames(dataFilt),
typesample = c("TP")
)
# 正常与肿瘤样本的差异表达分析
dataDEGs <- TCGAanalyze_DEA(
mat1 = dataFilt[,samplesNT],
mat2 = dataFilt[,samplesTP],
Cond1type = "Normal",
Cond2type = "Tumor",
fdr.cut = 0.01 ,
logFC.cut = 1,
method = "glmLRT"
)
# 差异基因 DEGs
dataDEGsFiltLevel <- TCGAanalyze_LevelTab(
FC_FDR_table_mRNA = dataDEGs,
typeCond1 = "Tumor",
typeCond2 = "Normal",
TableCond1 = dataFilt[,samplesTP],
TableCond2 = dataFilt[,samplesNT]
)
差异基因结果如下
主成分分析
我们可以使用 PCA
来进行降维分析。函数 TCGAvisualize_PCA
将绘制不同组的 PCA
图。
# top200 DEGs
pca <- TCGAvisualize_PCA(
dataFilt = dataFilt,
dataDEGsFiltLevel = dataDEGsFiltLevel,
ntopgenes = 200,
group1 = group1,
group2 = group2
)
使用 STAR counts 数据
对于 STAR counts
数据,其分析方法也是类似的,先检索样本信息,选择正常和癌症样本各 10
个
query <- GDCquery(
project = "TCGA-BRCA",
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
workflow.type = "STAR - Counts"
)
samplesDown <- getResults(query, cols = c("cases"))
dataSmTP <- TCGAquery_SampleTypes(
barcode = samplesDown,
typesample = "TP"
)
dataSmNT <- TCGAquery_SampleTypes(
barcode = samplesDown,
typesample = "NT"
)
dataSmTP_short <- dataSmTP[1:10]
dataSmNT_short <- dataSmNT[1:10]
获取并下载这 20
个样本的表达数据,然后对数据进行处理以及标准化。
query.selected.samples <- GDCquery(
project = "TCGA-BRCA",
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
workflow.type = "STAR - Counts",
barcode = c(dataSmTP_short, dataSmNT_short)
)
GDCdownload(
query = query.selected.samples
)
dataPrep <- GDCprepare(
query = query.selected.samples,
save = TRUE
)
dataPrep <- TCGAanalyze_Preprocessing(
object = dataPrep,
cor.cut = 0.6,
datatype = "unstrand"
)
dataNorm <- TCGAanalyze_Normalization(
tabDF = dataPrep,
geneInfo = geneInfoHT,
method = "gcContent"
)
比较标准化前后数据分布的差异
boxplot(dataPrep, outline = FALSE, names = FALSE)
boxplot(dataNorm, outline = FALSE, names = FALSE)
:::: column
::: column-left
标准化前
:::
::: column-right
标准化后
:::
::::
过滤低表达数据,然后进行差异表达分析
dataFilt <- TCGAanalyze_Filtering(
tabDF = dataNorm,
method = "quantile",
qnt.cut = 0.25
)
dataDEGs <- TCGAanalyze_DEA(
mat1 = dataFilt[,dataSmTP_short],
mat2 = dataFilt[,dataSmNT_short],
Cond1type = "Normal",
Cond2type = "Tumor",
fdr.cut = 0.01 ,
logFC.cut = 1,
method = "glmLRT"
)
绘制火山图
绘制差异基因火山图
TCGAVisualize_volcano(
x = dataDEGs$logFC,
y = dataDEGs$FDR,
filename = "case3/Case3_volcanoexp.png",
x.cut = 3,
y.cut = 10^-5,
names = rownames(dataDEGs),
color = c("black","red","darkgreen"),
names.size = 2,
xlab = " Gene expression fold change (Log2)",
legend = "State",
title = "Volcano plot (CIMP-high vs CIMP-low)",
width = 10
)
miRNA 数据
对于 miRNA
分析,也是类似的。
query.miRNA <- GDCquery(
project = "TCGA-BRCA",
experimental.strategy = "miRNA-Seq",
data.category = "Transcriptome Profiling",
data.type = "miRNA Expression Quantification"
)
GDCdownload(query = query.miRNA)
dataAssy.miR <- GDCprepare(
query = query.miRNA
)
rownames(dataAssy.miR) <- dataAssy.miR$miRNA_ID
提取出 read_count
数据,并进行差异表达分析
miR <- tibble::column_to_rownames(dataAssy.miR, "miRNA_ID") %>%
dplyr::select(starts_with("read_count")) %>%
rename_with(function(x) str_remove(x, "read_count_"))
dataFilt <- TCGAanalyze_Filtering(
tabDF = miR,
method = "quantile",
qnt.cut = 0.25
)
dataSmTP.miR <- TCGAquery_SampleTypes(
barcode = colnames(dataFilt),
typesample = "TP"
)
dataSmNT.miR <- TCGAquery_SampleTypes(
barcode = colnames(dataFilt),
typesample = "NT"
)
dataMIR <- TCGAanalyze_DEA(
mat1 = dataFilt[,dataSmNT.miR],
mat2 = dataFilt[,dataSmTP.miR],
Cond1type = "Normal",
Cond2type = "Tumor",
fdr.cut = 0.01 ,
logFC.cut = 1,
method = "glmLRT"
)
查看分析结果
GO 富集分析
在获取到差异表达基因之后,为了更好的理解潜在的生物学过程,通常会对这组基因所集中的功能通路感兴趣,可以通过功能富集来获取潜在的通路
我们可以使用 TCGAanalyze_EAcomplete
函数来进行功能富集,然后使用 TCGAvisualize_EAbarplot
来展示功能富集的结果
例如,我们使用前面获取到的基因列表来进行功能富集
Genelist <- dataDEGs$gene_name
ansEA <- TCGAanalyze_EAcomplete(
TFname = "DEA genes Normal Vs Tumor",
RegulonList = Genelist
)
TCGAvisualize_EAbarplot(
tf = rownames(ansEA$ResBP),
GOBPTab = ansEA$ResBP,
GOCCTab = ansEA$ResCC,
GOMFTab = ansEA$ResMF,
PathTab = ansEA$ResPat,
nRGTab = Genelist,
nBar = 10,
filename = NULL # 设置为 NULL 会返回图像而不是输出到文件中
)
生存分析
TCGAanalyze_survival
函数可以用于绘制生存曲线图,会自动根据列名 days_to_death
和 vital
提取生存信息,并根据 clusterCol
参数传递的列名进行分组。例如
clin.gbm <- GDCquery_clinic("TCGA-GBM", "clinical")
TCGAanalyze_survival(
data = clin.gbm,
clusterCol = "gender",
main = "TCGA Set\n GBM",
height = 10,
width = 10,
filename = NULL
)
基因表达与生存
clinical_patient_Cancer <- GDCquery_clinic("TCGA-BRCA", "clinical")
dataBRCAcomplete <- as.data.frame(log2(dataPrep[1:20,] + 1))
tabSurvKM <- TCGAanalyze_SurvivalKM(
clinical_patient = clinical_patient_Cancer,
dataGE = dataBRCAcomplete,
Genelist = rownames(dataBRCAcomplete),
Survresult = FALSE,
p.cut = 1,
ThreshTop = 0.67,
ThreshDown = 0.33,
group1 = dataSmNT_short, # Control group
group2 = dataSmTP_short
)
差异甲基化分析
使用 TCGAanalyze_DMC
函数,从两组样本中的 beta
值中寻找差异甲基化 CpG
位点。
- 首先,计算每个探针在两组之间的甲基化均值之差
- 然后,使用
wilcoxon test
来识别两组之间的差异表达,并使用Benjamini-Hochberg
来进行FDR
矫正。 - 在分析完之后,会保存一个火山图,其中
x
轴为平均甲基化差值,y
轴为显著性水平。
TCGAanalyze_DMC
的参数包括
从 TCGA-BRCA
中挑选出 10
个样本,并根据癌和正常样本分组进行差异甲基化分析
# 安装依赖
if (!require("sesameData", quietly = TRUE))
BiocManager::install("sesameData")
if (!require("sesame", quietly = TRUE))
BiocManager::install("sesame")
library(SummarizedExperiment)
library(sesameData)
library(sesame)
# 查询 TCGA-BRCA 项目中既包含甲基化又有表达的样本
samples <- matchedMetExp("TCGA-BRCA", n = 10)
# 查询下载
query <- GDCquery(
project = "TCGA-BRCA",
data.category = "DNA Methylation",
platform = "Illumina Human Methylation 450",
data.type = "Methylation Beta Value",
barcode = samples
)
GDCdownload(query)
met <- GDCprepare(query, save = FALSE)
# 删除 NA 行
met <- subset(met,subset = (rowSums(is.na(assays(met))) == 0))
展示不同分组之间的平均甲基化水平的的箱线图,其中 groupCol
为 colData(met)
中的列名,表示按该列分组。例如,按 PAM50
亚型分组
TCGAvisualize_meanMethylation(met, groupCol = "paper_BRCA_Subtype_PAM50", filename = NULL)
识别差异甲基化区域
met_res <- TCGAanalyze_DMC(
met,
# 使用 sample_type 列来分组
groupCol = "paper_BRCA_Subtype_PAM50",
# 具体的 PAM50 分组名称
group1 = "Basal",
group2 = "LumB",
p.cut = 0.05,
diffmean.cut = 0.15,
save = FALSE,
legend = "State",
plot.filename = "~/Downloads/DMC.png",
cores = 1
)
输出的火山图为
从图中可以看出,并没有找到显著的差异甲基化区域。
整合表达和甲基化
starburst plot
通过结合两个火山图的信息,并应用于 DNA
甲基化和基因表达的研究。它于 2010
年首次提出(Noushmehr
等人)。我们可以使用 TCGAvisualize_starburst
函数。
该函数可比较 DNA
甲基化和基因表达。x
轴为 DNA
甲基化的 log10 FDR
值,y
轴为每个基因的基因表达量。黑色虚线表示 FDR
矫正后的 P
值为 0.01
。
# 这 5 列信息必须添加进去
rowRanges(met)$diffmean.Basal.LumB <- met_res$mean.Basal.minus.mean.LumB
rowRanges(met)$diffmean.LumB.Basal <- -1 * met_res$mean.Basal.minus.mean.LumB
rowRanges(met)$p.value.Basal.LumB <- met_res$p.value.Basal.LumB
rowRanges(met)$p.value.adj.Basal.LumB <- met_res$p.value.adj.Basal.LumB
rowRanges(met)$probeID <- names(met)
starburst <- TCGAvisualize_starburst(
met = met,
exp = dataDEGs,
genome = "hg38",
group1 = "Basal",
group2 = "LumB",
filename = "~/Downloads/starburst.png",
met.platform = "Illumina Human Methylation 450",
met.p.cut = 10^-3,
exp.p.cut = 10^-3,
diffmean.cut = 0.25,
logFC.cut = 3,
names = FALSE,
height = 10,
width = 15,
dpi = 300
)
该函数将绘制一张图片并返回一个矩阵,其中包含基因表达(上调/下调)和甲基化(高/低甲基化)相关的状态。