# Download TCGA-HNSC gene-expression count files from GDC and turn them into
# an expression matrix (expMatrix).
library(TCGAbiolinks)

# Helpers for discovering valid query arguments (kept commented out):
#   getGDCprojects()$project_id
#     lists every value accepted by `project` (cancer types)
#   TCGAbiolinks:::getProjectSummary("TCGA-HNSC")
#     lists the data.category values available for the chosen project

# NOTE(review): newer GDC data releases appear to publish
# "STAR - Counts" instead of "HTSeq - Counts" -- confirm this workflow.type
# is still served before relying on the query.
expquery <- GDCquery(
  project       = "TCGA-HNSC",
  data.category = "Transcriptome Profiling",
  data.type     = "Gene Expression Quantification",
  workflow.type = "HTSeq - Counts"
)

# Fetch the files matched by the query, then load them into one R object.
GDCdownload(query = expquery)
expquery2 <- GDCprepare(query = expquery)

# Preprocess the prepared object into the expression matrix.
expMatrix <- TCGAanalyze_Preprocessing(object = expquery2)

# Optional export of the count matrix (uncomment the line matching your OS).
# R on Linux:
# write.table(expMatrix, "../myresult_txt/HNSC_gene_counts_file00.tsv", sep="\t", quote=F, row.names=T)
# R on Windows:
# write.table(expMatrix, "..\\myresult_txt\\HNSC_gene_counts_file00.tsv", sep="\t", quote=F, row.names=T)
# Fetch the TCGA-HNSC clinical table from GDC and export it as a TSV file.
library(TCGAbiolinks)

# GDCquery_clinic() hands back the clinical table itself (it is written out
# directly below), so there is no separate GDCdownload() step: the former
# GDCdownload(query_clin) call passed that table where a GDCquery() result
# is expected.
query_clin <- GDCquery_clinic(
  project  = "TCGA-HNSC",
  type     = "clinical",
  save.csv = FALSE  # logical flag, not the string "FALSE"
)

# file.path() yields a path that is valid on both Linux and Windows, replacing
# the separate hard-coded "/" and "\\" variants of this call.
# The ../myresult_txt directory must already exist.
write.table(
  query_clin,
  file      = file.path("..", "myresult_txt", "HNSC_clinical_file00t.tsv"),
  sep       = "\t",
  quote     = FALSE,
  col.names = TRUE,
  row.names = FALSE
)
学习生信,先下载R和RStudio软件
第一篇参考链接:TCGA数据下载—TCGAbiolinks包参数详解 - 组学大讲堂问答社区 (omicsclass.com)
再安装R里和生信有关的包:
# One-time setup: point R at the Tsinghua (TUNA) mirrors, then install and
# load TCGAbiolinks.

# Override only the CRAN entry of the repos option; local() keeps the
# temporary variable out of the global environment. HTTPS is used so packages
# are not fetched over an unencrypted connection.
local({
  repos <- getOption("repos")
  repos["CRAN"] <- "https://mirrors.tuna.tsinghua.edu.cn/CRAN/"
  options(repos = repos)
})
options(BioC_mirror = "https://mirrors.tuna.tsinghua.edu.cn/bioconductor")

# Install each package only when it is missing, so re-running this section
# does not reinstall anything.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
if (!requireNamespace("TCGAbiolinks", quietly = TRUE)) {
  BiocManager::install("TCGAbiolinks")
}

library(TCGAbiolinks)
1.首先我们需要知道总的数据框架,先输入以下代码,获取GDC数据库详细资料,可在Environment > data查看:
# Retrieve the table of GDC projects. The result is not assigned here, so it
# is printed to the console rather than stored in a variable.
getGDCprojects()
2.获取最新不同癌种ID(知道他们的ID才方便后面下载它所包含的数据):
# Keep only the project_id column of the project table: these identifiers
# (e.g. "TCGA-HNSC") are the values passed as `project` in the GDC queries.
getGDCprojects()$project_id
[1] "TCGA-BRCA" "GENIE-MSK"
[3] "GENIE-VICC" "GENIE-UHN"
[5] "CPTAC-2" "CMI-ASC"
[7] "BEATAML1.0-COHORT" "CGCI-BLGSP"
[9] "BEATAML1.0-CRENOLANIB" "CMI-MPC"
[11] "CMI-MBC" "GENIE-GRCC"
[13] "GENIE-MDA" "GENIE-JHU"
[15] "GENIE-NKI" "FM-AD"
[17] "VAREPOP-APOLLO" "WCDT-MCRPC"
[19] "GENIE-DFCI" "TARGET-ALL-P3"
[21] "TARGET-ALL-P2" "OHSU-CNL"
[23] "TARGET-ALL-P1" "MMRF-COMMPASS"
[25] "TARGET-CCSK" "ORGANOID-PANCREATIC"
[27] "NCICCR-DLBCL" "TARGET-NBL"
[29] "TARGET-OS" "TARGET-RT"
[31] "TARGET-WT" "TCGA-LAML"
[33] "CGCI-HTMCP-CC" "TARGET-AML"
[35] "HCMI-CMDC" "TCGA-DLBC"
[37] "TCGA-CHOL" "CTSP-DLBCL1"
[39] "TRIO-CRU" "TCGA-MESO"
[41] "TCGA-ACC" "TCGA-UCS"
[43] "TCGA-KICH" "TCGA-PCPG"
[45] "TCGA-ESCA" "TCGA-THYM"
[47] "TCGA-TGCT" "TCGA-UVM"
[49] "TCGA-CESC" "TCGA-BLCA"
[51] "TCGA-PAAD" "TCGA-LIHC"
[53] "TCGA-SKCM" "TCGA-UCEC"
[55] "TCGA-PRAD" "REBC-THYR"
[57] "TCGA-THCA" "TCGA-OV"
[59] "TCGA-LGG" "TCGA-SARC"
[61] "CPTAC-3" "TCGA-COAD"
[63] "TCGA-READ" "TCGA-KIRP"
[65] "TCGA-GBM" "TCGA-STAD"
[67] "TCGA-LUAD" "TCGA-KIRC"
[69] "TCGA-LUSC" "TCGA-HNSC"
3.查看某个癌种(如头颈鳞状细胞癌)的数据类型(data.category)
# Print the summary for one project (file and case counts per data category).
# `:::` reaches a function that TCGAbiolinks does not export, so this is an
# internal helper rather than part of the package's public interface.
TCGAbiolinks:::getProjectSummary("TCGA-HNSC")
$file_count
[1] 16694

$data_categories
file_count case_count data_category
1 3227 526 Copy Number Variation
2 573 528 Clinical
3 354 354 Proteome Profiling
4 2776 528 Transcriptome Profiling
5 4104 510 Simple Nucleotide Variation
6 2858 528 Biospecimen
7 580 528 DNA Methylation
8 2222 528 Sequencing Reads

$case_count
[1] 528

$file_size
[1] 1.832829e+