前言
前面,我们介绍了如何获取 TCGA
的各种数据。在获取到数据之后,我们就可以进行数据分析及分析结果的可视化了
TCGAbiolinks
也提供了一些列的函数,通过封装一些常用的算法来简化分析的流程。例如差异基因、富集分析、生存分析等
先导入依赖包
library(TCGAbiolinks)
library(SummarizedExperiment)
library(tidyverse)
数据分析
1. 表达谱处理
- 从
legacy
数据库中获取基因表达谱
# 定义样本列表
listSamples <-
c(
"TCGA-E9-A1NG-11A-52R-A14M-07",
"TCGA-BH-A1FC-11A-32R-A13Q-07",
"TCGA-A7-A13G-11A-51R-A13Q-07",
"TCGA-BH-A0DK-11A-13R-A089-07",
"TCGA-E9-A1RH-11A-34R-A169-07",
"TCGA-BH-A0AU-01A-11R-A12P-07",
"TCGA-C8-A1HJ-01A-11R-A13Q-07",
"TCGA-A7-A13D-01A-13R-A12P-07",
"TCGA-A2-A0CV-01A-31R-A115-07",
"TCGA-AQ-A0Y5-01A-11R-A14M-07"
)
# 查询 Illumina HiSeq 平台的数据
query <- GDCquery(
project = "TCGA-BRCA",
data.category = "Gene expression",
data.type = "Gene expression quantification",
experimental.strategy = "RNA-Seq",
platform = "Illumina HiSeq",
file.type = "results",
barcode = listSamples,
legacy = TRUE
)
# 下载 IlluminaHiSeq_RNASeqV2 平台中对应样本的信息
GDCdownload(query)
# 处理成行尾 geneID 列为样本 (barcode) 的矩阵
BRCARnaseqSE <- GDCprepare(query)
# 获取表达谱
BRCAMatrix <- assay(BRCARnaseqSE,"raw_count")
表达谱像这样子
行名是基因 symbol
与 ID
合并的形式,可以与 rowRanges(BRCARnaseqSE)
获取的基因信息来进行转换
比如,我想将行名转换为 symbol
feature <- rowRanges(BRCARnaseqSE)
rownames(BRCAMatrix) <- feature[rownames(BRCAMatrix), "gene_id"]$gene_id
- 从
harmonized
数据库中获取基因表达谱
get_expression <- function(proj) {
query <- GDCquery(
project = proj,
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
workflow.type = "HTSeq - FPKM"
)
GDCdownload(query)
data <- GDCprepare(query)
# 由于该表达谱的行尾 Ensembl ID,我们要转换为 gene symbol
exp <- assay(data) %>% as.data.frame() %>%
rownames_to_column(var = "Ensembl_ID") %>% {
# 如果一个 Ensembl ID 映射到多个 gene symbols,要将它删除
unimap <- mapIds(
org.Hs.eg.db, keys = .$Ensembl_ID, keytype = "ENSEMBL",
column = "SYMBOL", multiVals = "filter")
filter(., Ensembl_ID %in% names(unimap))
} %>%
# 将 Ensembl ID 对应到基因 symbol
mutate(Symbol = AnnotationDbi::select(
org.Hs.eg.db, keys = .$Ensembl_ID, keytype = "ENSEMBL",
columns = "SYMBOL")$SYMBOL, .before = Ensembl_ID) %>%
# 删除 Ensembl_ID 列和未配对到 symbol 的 NA 行
dplyr::select(!Ensembl_ID) %>%
filter(!is.na(Symbol)) %>%
# 对基因进行分组取平均值
group_by(Symbol) %>%
summarise_all(mean) %>%
column_to_rownames(var = "Symbol")
return(exp)
}
2. 差异表达分析
TCGAanalyze_DEA
函数用于执行差异表达分析(DEA
)来识别差异表达基因(DEG
)
差异表达基因识别算法是基于 limma
和 edgeR
两个包的,默认使用的 edgeR
,可以设置 pipeline = "limma"
来使用 limma
主要参数有:
-
mat1
:第一分组样本的表达数据 -
mat2
:第二分组样本的表达数据 -
Cond1type
:第一分组的标签 -
Cond2type
:第二分组的标签 -
fdr.cut
:FDR
阈值,默认为1
-
logFC.cut
:FC
阈值,默认为0
例如,使用上一步生成的表达谱,识别差异表达基因
# 标准化
dataNorm <- TCGAanalyze_Normalization(
tabDF = BRCAMatrix, geneInfo = TCGAbiolinks::geneInfo
)
# 使用分位数来过滤基因
dataFilt <- TCGAanalyze_Filtering(
tabDF = dataNorm, method = "quantile", qnt.cut = 0.25
)
# 挑选正常样本:NT
samplesNT <- TCGAquery_SampleTypes(
barcode = colnames(dataFilt), typesample = c("NT")
)
# 挑选肿瘤样本:TP
samplesTP <- TCGAquery_SampleTypes(
barcode = colnames(dataFilt),
typesample = c("TP")
)
# 差异表达分析
dataDEGs <- TCGAanalyze_DEA(
mat1 = dataFilt[, samplesNT],
mat2 = dataFilt[, samplesTP],
Cond1type = "Normal",
Cond2type = "Tumor",
fdr.cut = 0.01 ,
logFC.cut = 1,
method = "glmLRT"
)
# 获取差异表达基因的表达水平
dataDEGsFiltLevel <- TCGAanalyze_LevelTab(
dataDEGs, "Tumor", "Normal",
dataFilt[, samplesTP], dataFilt[, samplesNT]
)
结果如下
2.1 HTSeq 数据
对于 HTSeq
数据,其分析方法也是类似的,先获取部分样本,选择正常和癌症样本各 10
个
query <- GDCquery(
project = CancerProject,
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
workflow.type = "HTSeq - Counts"
)
samplesDown <- getResults(query,cols=c("cases"))
dataSmTP <- TCGAquery_SampleTypes(
barcode = samplesDown, typesample = "TP"
)
dataSmNT <- TCGAquery_SampleTypes(
barcode = samplesDown, typesample = "NT"
)
dataSmTP_short <- dataSmTP[1:10]
dataSmNT_short <- dataSmNT[1:10]
获取数据并进行标准化
CancerProject <- "TCGA-BRCA"
DataDirectory <- paste0("~/Downloads/",gsub("-","_",CancerProject))
FileNameData <- paste0(DataDirectory, "_","HTSeq_Counts",".rda")
# 查询对应样本
queryDown <- GDCquery(
project = CancerProject,
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
workflow.type = "HTSeq - Counts",
barcode = c(dataSmTP_short, dataSmNT_short)
)
# 下载并保存到指定路径
GDCdownload(query = queryDown, directory = DataDirectory)
# 预处理数据并保存到本地
dataPrep <- GDCprepare(
query = queryDown,
save = TRUE,
directory = DataDirectory,
save.filename = FileNameData
)
# 获取 count 数据
dataPrep <- TCGAanalyze_Preprocessing(
object = dataPrep, cor.cut = 0.6, datatype = "HTSeq - Counts"
)
dataPrep %<>% as.data.frame() %>%
rownames_to_column(var = "Ensembl_ID") %>% {
unimap <- mapIds(
org.Hs.eg.db, keys = .$Ensembl_ID, keytype = "ENSEMBL",
column = "SYMBOL", multiVals = "filter")
filter(., Ensembl_ID %in% names(unimap))
} %>%
mutate(Symbol = AnnotationDbi::select(
org.Hs.eg.db, keys = .$Ensembl_ID, keytype = "ENSEMBL",
columns = "SYMBOL")$SYMBOL, .before = Ensembl_ID) %>%
dplyr::select(!Ensembl_ID) %>%
filter(!is.na(Symbol)) %>%
group_by(Symbol) %>%
summarise_all(mean) %>%
column_to_rownames(var = "Symbol")
# 对数据进行标准化
dataNorm <- TCGAanalyze_Normalization(
tabDF = dataPrep, geneInfo = TCGAbiolinks::geneInfoHT, method = "gcContent"
)
比较标准化前后数据分布的差别
boxplot(dataPrep, outline = FALSE, names = FALSE)
boxplot(dataNorm, outline = FALSE, names = FALSE)
数据过滤和差异表达分析
# 将 Ensembl ID 转换为 symbol
dataNorm %<>% as.data.frame() %>%
rownames_to_column(var = "Ensembl_ID") %>% {
unimap <- mapIds(
org.Hs.eg.db, keys = .$Ensembl_ID, keytype = "ENSEMBL",
column = "SYMBOL", multiVals = "filter")
filter(., Ensembl_ID %in% names(unimap))
} %>%
mutate(Symbol = AnnotationDbi::select(
org.Hs.eg.db, keys = .$Ensembl_ID, keytype = "ENSEMBL",
columns = "SYMBOL")$SYMBOL, .before = Ensembl_ID) %>%
dplyr::select(!Ensembl_ID) %>%
filter(!is.na(Symbol)) %>%
group_by(Symbol) %>%
summarise_all(mean) %>%
column_to_rownames(var = "Symbol")
dataFilt <- TCGAanalyze_Filtering(
tabDF = dataNorm, method = "quantile", qnt.cut = 0.25
)
dataDEGs <- TCGAanalyze_DEA(
mat1 = dataFilt[, dataSmTP_short],
mat2 = dataFilt[, dataSmNT_short],
Cond1type = "Normal",
Cond2type = "Tumor",
fdr.cut = 0.01 ,
logFC.cut = 1,
method = "glmLRT"
)
2.2 miRNA 数据
对于 miRNA
的处理,由于包含多种类型的文件,所以需要设置 file.type
参数,参数的取值可以使用如下方式来确定
query <- GDCquery(
project = CancerProject,
data.category = "Gene expression",
data.type = "miRNA gene quantification",
legacy = TRUE
)
> head(getResults(query)$file_name)
[1] "TCGA-B6-A3ZX-01A-11R-A23A-13.mirna.quantification.txt"
[2] "TCGA-BH-A0H0-01A-11R-A057-13.hg19.mirbase20.mirna.quantification.txt"
[3] "TCGA-EW-A2FS-01A-11R-A17A-13.mirna.quantification.txt"
[4] "TCGA-AC-A3W6-01A-12R-A22I-13.mirna.quantification.txt"
[5] "TCGA-EW-A1P8-01A-11R-A143-13.mirna.quantification.txt"
[6] "TCGA-C8-A8HR-01A-11R-A36A-13.hg19.mirbase20.mirna.quantification.txt"
可以看到,包含两种类型的文件,我们要使用的是 *mirna.quantification.txt
类型的文件
先挑选 20
个样本
CancerProject <- "TCGA-BRCA"
DataDirectory <- paste0("~/Downloads/", gsub("-", "_", CancerProject))
FileNameData <-paste0(DataDirectory, "_", "miRNA_gene_quantification", ".rda")
# 查询对应样本
query.miR <- GDCquery(
project = CancerProject,
data.category = "Gene expression",
data.type = "miRNA gene quantification",
file.type = "mirna",
legacy = TRUE
)
samplesDown.miR <- getResults(query.miR, cols = c("cases"))
dataSmTP.miR <- TCGAquery_SampleTypes(
barcode = samplesDown.miR, typesample = "TP"
)
dataSmNT.miR <- TCGAquery_SampleTypes(
barcode = samplesDown.miR, typesample = "NT"
)
dataSmTP_short.miR <- dataSmTP.miR[1:10]
dataSmNT_short.miR <- dataSmNT.miR[1:10]
下载并预处理数据
queryDown.miR <- GDCquery(
project = CancerProject,
data.category = "Gene expression",
data.type = "miRNA gene quantification",
file.type = "mirna",
legacy = TRUE,
barcode = c(dataSmTP_short.miR, dataSmNT_short.miR)
)
GDCdownload(query = queryDown.miR, directory = DataDirectory)
dataAssy.miR <- GDCprepare(
query = queryDown.miR,
save = TRUE,
save.filename = FileNameData,
summarizedExperiment = TRUE,
directory = DataDirectory
)
提取出 read_count
数据,并进行差异表达分析
dataAssy.miR %<>%
column_to_rownames(var = "miRNA_ID") %>%
dplyr::select(starts_with("read_count_")) %>%
rename_all(function(x) gsub("read_count_", "", x))
dataFilt <- TCGAanalyze_Filtering(
tabDF = dataAssy.miR, method = "quantile",
qnt.cut = 0.25
)
dataDEGs <- TCGAanalyze_DEA(
mat1 = dataFilt[, dataSmNT_short.miR],
mat2 = dataFilt[, dataSmTP_short.miR],
Cond1type = "Normal",
Cond2type = "Tumor",
fdr.cut = 0.01 ,
logFC.cut = 1,
method = "glmLRT"
)
查看结果
3. go 富集分析
在获取到差异表达基因之后,为了更好的理解潜在的生物学过程,通常会对这组基因所集中的功能通路感兴趣,可以通过功能富集来获取潜在的通路
我们可以使用 TCGAanalyze_EAcomplete
函数来进行功能富集,然后使用 TCGAvisualize_EAbarplot
来展示功能富集的结果
例如,我们使用前面获取到的基因列表来进行功能富集
Genelist <- rownames(dataDEGsFiltLevel)
system.time(
ansEA <- TCGAanalyze_EAcomplete(
TFname="DEA genes Normal Vs Tumor",Genelist)
)
TCGAvisualize_EAbarplot(
tf = rownames(ansEA$ResBP),
GOBPTab = ansEA$ResBP,
GOCCTab = ansEA$ResCC,
GOMFTab = ansEA$ResMF,
PathTab = ansEA$ResPat,
nRGTab = Genelist,
nBar = 10,
filename = "~/Downloads/go_enrichment.pdf"
)
4. 生存分析
TCGAanalyze_survival
函数可以用于绘制生存曲线图,会自动根据列名 days_to_death
和 vital
提取生存信息,并根据 clusterCol
参数传递的列名进行分组。例如
clin.brca <- GDCquery_clinic("TCGA-BRCA", "clinical")
TCGAanalyze_survival(
clin.brca, "prior_treatment", main = "TCGA Set\n GBM",
height = 10, width = 10,
filename = "~/Downloads/survival.pdf"
)
5. 差异甲基化分析
使用 TCGAanalyze_DMC
函数,从两组样本中的 beta
值中寻找差异甲基化 CpG
位点。
- 首先,计算每个探针在两组之间的甲基化均值之差
- 然后,使用
wilcoxon test
来识别两组之间的差异表达,并使用Benjamini-Hochberg
来进行FDR
矫正。 - 在分析完之后,会保存一个火山图,其中
x
轴为平均甲基化差值,y
轴为显著性水平。
TCGAanalyze_DMC
的参数包括
从 TCGA-BRCA
中挑选出 10
个样本,并根据癌和正常样本分组进行差异甲基化分析
# 查询 TCGA-BRCA 项目中既包含甲基化又有表达的样本
samples <- matchedMetExp("TCGA-BRCA", n = 10)
# 查询下载
query <- GDCquery(
project = c("TCGA-BRCA"),
data.category = "DNA methylation",
platform = "Illumina Human Methylation 450",
legacy = TRUE,
barcode = samples
)
GDCdownload(query)
met <- GDCprepare(query, save = FALSE)
# 删除 NA 行
met <- subset(met,subset = (rowSums(is.na(assay(met))) == 0))
展示癌与正常之间平均甲基化水平的的箱线图
TCGAvisualize_meanMethylation(met, groupCol = "sample_type", filename = NULL)
识别差异甲基化区域
met_res <- TCGAanalyze_DMC(
met,
# 使用 sample_type 列来分组
groupCol = "sample_type",
# 分组名称
group1 = "Primary Tumor",
group2 = "Solid Tissue Normal",
p.cut = 0.05,
diffmean.cut = 0.15,
save = FALSE,
legend = "State",
plot.filename = "~/Downloads/BRCA.png",
cores = 1
)
输出的火山图为
从图中可以看出,并没有找到显著的差异甲基化区域。