TCGAbiolinks是一个用于GDC数据综合分析的R/Bioconductor软件包,本文主要展示用它下载各类数据集的代码。
1. 包的加载
# if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#
# BiocManager::install("TCGAbiolinks")
library(TCGAbiolinks)
library(SummarizedExperiment)
library(dplyr)
library(DT)
# List the objects available in the attached TCGAbiolinks package environment.
ls(name = "package:TCGAbiolinks")

# Show the ids of all projects returned by GDC.
getGDCprojects()[["project_id"]]

# Pick one project and inspect which data it contains.
# Alternative project: "TCGA-READ"
project <- "TCGA-COAD"
TCGAbiolinks:::getProjectSummary(project)
# data.type: candidate values for the `data.type` query argument.
# Each assignment below overwrites the previous one, so after running all
# three lines only the last value remains; keep the one you actually need.

# RNA-seq gene expression quantification (counts).
data.type <- "Gene Expression Quantification"

# miRNA expression quantification.
data.type <- "miRNA Expression Quantification"

# Copy number variation segments.
data.type <- "Copy Number Segment"
#workflow.type 有三种类型,分别为:
#HTSeq - FPKM-UQ:FPKM上四分位数标准化值
#HTSeq - FPKM:FPKM值/表达量值
#HTSeq - Counts:原始count数
#注意:GDC自Data Release 32起改用STAR流程,上述HTSeq类型在新版数据中已不可用,对应的取值为"STAR - Counts"。
#legacy
#这个参数主要是因为TCGA数据有两个入口可以下载,
#GDC Legacy Archive 和 GDC Data Portal,
#区别主要是注释参考基因组版本不同分别是:
#GDC Legacy Archive(hg19)和GDC Data Portal(hg38)。
#参数默认为FALSE,下载GDC Data Portal(hg38)。
#这里建议是,下载转录组层面的数据使用hg38,
#下载DNA层面的数据使用hg19,
#因为比如做SNP分析的时候很多数据库没有hg38版本的数据,都是hg19的。
更多data.category,data.type,workflow.type的取值可以参考https://www.bioconductor.org/packages/release/bioc/vignettes/TCGAbiolinks/inst/doc/query.html#Harmonized_data_options_(legacy_=_FALSE)
2. 临床特征数据的下载
# GDCquery_clinic(): fetch clinical tables for the selected project.

# Patient-level clinical data. Pass save.csv = TRUE to also save the table
# into the working directory.
clin <- GDCquery_clinic(project = project, type = "clinical")

# Sample-level (biospecimen) information; one patient can have several
# clinical samples.
sample_info <- GDCquery_clinic(project = project, type = "biospecimen")
3. 下载基因表达谱数据
# Download RNA-seq gene-level counts for `project`.
# GDC no longer serves the "HTSeq - *" workflows for gene expression
# quantification; the harmonized data are produced by the STAR pipeline, so
# the query must ask for "STAR - Counts" instead of "HTSeq - Counts".
query <- GDCquery(
  project = project,
  data.category = "Transcriptome Profiling",
  data.type = "Gene Expression Quantification",
  workflow.type = "STAR - Counts"
)

# Download the matched files, at most 200 files per chunk.
GDCdownload(query, files.per.chunk = 200)

# Read the downloaded files into a single R object.
expdat <- GDCprepare(query = query)

# assay() without an index returns the first assay of the object.
# NOTE(review): with "STAR - Counts" the first assay is expected to hold the
# unstranded raw counts -- confirm with assayNames(expdat).
count_matrix <- assay(expdat)

# Write the matrix to "<project>_counts.csv" in the working directory.
write.csv(count_matrix, file = paste(project, "counts.csv", sep = "_"))
4. 下载甲基化数据
# Download DNA methylation data (Illumina Human Methylation 27 platform)
# for `project`.
# NOTE(review): legacy = TRUE requests the GDC Legacy Archive (hg19) rather
# than the harmonized portal -- confirm the installed TCGAbiolinks version and
# GDC still support legacy queries.
query2 <- GDCquery(
  project = project,
  data.category = "DNA methylation",
  legacy = TRUE,
  platform = "Illumina Human Methylation 27"
)

# Download the matched files, at most 300 files per chunk.
GDCdownload(query2, files.per.chunk = 300)

# Read the downloaded files and pull out the first assay as a matrix.
expdat2 <- GDCprepare(query = query2)
count_matrix2 <- assay(expdat2)

# Write the matrix to "<project>_methylation.csv" in the working directory.
write.csv(count_matrix2, file = paste(project, "methylation.csv", sep = "_"))
5. 下载miRNA数据
# Download miRNA expression quantification data for `project`.
query3 <- GDCquery(
  project = project,
  data.category = "Transcriptome Profiling",
  data.type = "miRNA Expression Quantification",
  workflow.type = "BCGSC miRNA Profiling"
)

# Download through the GDC API, at most 300 files per chunk.
GDCdownload(query3, method = "api", files.per.chunk = 300)

# Per the original author's note, the prepared result is a data.frame.
expdat3 <- GDCprepare(query = query3)

# Write the table to "<project>_miRNA.csv" in the working directory.
write.csv(expdat3, file = paste(project, "miRNA.csv", sep = "_"))
6. 下载cnv数据
# Download masked copy number segment (CNV) data for `project`.
query4 <- GDCquery(
  project = project,
  data.category = "Copy Number Variation",
  data.type = "Masked Copy Number Segment"
)

# Download the matched files, at most 300 files per chunk.
GDCdownload(query4, files.per.chunk = 300)

# Per the original author's note, the prepared result has classes
# "spec_tbl_df" "tbl_df" "tbl" "data.frame".
expdat4 <- GDCprepare(query = query4)

# Write the table to "<project>_cnv.csv" in the working directory.
write.csv(expdat4, file = paste(project, "cnv.csv", sep = "_"))
7. 下载基因组突变数据
Usage
GDCquery_Maf(tumor, save.csv = FALSE, directory = "GDCdata", pipelines = NULL)
# Show the ids of all projects returned by GDC.
getGDCprojects()$project_id

# Get the mutation data for tumor "CHOL" from the "muse" pipeline.
# NOTE(review): GDCquery_Maf() may be unavailable in recent TCGAbiolinks
# releases, where mutation data is queried through GDCquery() with
# data.category = "Simple Nucleotide Variation" -- confirm against the
# installed version.
maf <- GDCquery_Maf("CHOL", pipelines = "muse")
dim(maf)

## Build a MAF object for downstream analysis.
library(maftools)

# The optional example below is intentionally left commented out. Every line
# of it, including the continuation lines, carries a leading "#": in the
# original the continuation lines were uncommented and broke parsing. It also
# referred to an undefined `maf2`; `maf` (created above) is used instead.
#
# write.csv(as.data.frame(maf), "test_maf.csv", row.names = FALSE)
#
# Mock clinical annotation: one row per Tumor_Sample_Barcode with a
# placeholder class label.
# maf_clin <- data.frame('Tumor_Sample_Barcode' = maf$Tumor_Sample_Barcode,
#                        'class' = c(rep("unknow", length(maf$Tumor_Sample_Barcode))))
# write.csv(as.data.frame(maf_clin), "test_maf_clin.csv", row.names = FALSE)
# maf_obj <- read.maf(maf = as.data.frame(maf),
#                     clinicalData = as.data.frame(maf_clin))
更方便地用浏览器下载癌症多组学数据:https://ucsc-xena.gitbook.io/project/public-data-we-host/tcga
参考
https://rdrr.io/bioc/TCGAbiolinks/
https://www.bioconductor.org/packages/release/bioc/vignettes/TCGAbiolinks/inst/doc/index.html
https://www.cancer.gov/about-nci/organization/ccg/research/structural-genomics/tcga