数据挖掘:是时候更新一下TCGA的数据了

TCGA在去年更新之后提供了Count、TPM、FPKM三种格式的mRNA表达量数据,同时提供了ensembl gene ID、基因名、基因类型,因此有必要更新一下数据了。

 

18169ac8cac94e0aaf891dbabd76b1de.png

 

安装需要的R包

# CRAN packages used by the rest of this walkthrough.
install.packages(c("tidyverse", "arrow", "data.table", "magrittr", "pacman"))

# BiocManager is only bootstrapped when it is not already available.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}

# Development versions from GitHub; the data package is installed first,
# in the same order as before.
invisible(lapply(
  c(
    "BioinformaticsFMRP/TCGAbiolinksGUI.data",
    "BioinformaticsFMRP/TCGAbiolinks"
  ),
  BiocManager::install
))

 

TCGA数据版本信息

# Attach the packages used throughout this walkthrough.
# No `rm(list = ls())` here: it deletes the caller's workspace objects but
# leaves attached packages and options untouched, so it does not provide a
# clean session. Restart R for that instead.
library(pacman)
p_load(magrittr, tidyverse, TCGAbiolinks, data.table, arrow)

# Ask the GDC API which data release it is currently serving.
TCGAbiolinks::getGDCInfo()
# Output recorded when this walkthrough was written:
# $commit
# [1] "4dd3680528a19ed33cfc83c7d049426c97bb903b"
# $data_release
# [1] "Data Release 36.0 - December 12, 2022"
# $status
# [1] "OK"
# $tag
# [1] "3.0.0"
# $version
# [1] 1

 

建几个文件夹

# -p makes the command safe to re-run: directories that already exist are
# left alone instead of causing an error.
mkdir -p mRNA miRNA SNV CNV Protein

需要下载的数据

# Project codes of every GDC project whose id starts with "TCGA", with the
# "TCGA-" prefix stripped (e.g. "TCGA-BRCA" becomes "BRCA").
gdc_projects <- TCGAbiolinks::getGDCprojects() %>%
  pull(id) %>%
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  grep(pattern = "^TCGA", x = ., value = TRUE) %>%
  # Anchored so only the leading prefix can ever be removed.
  str_remove("^TCGA-")

gdc_projects
# Output recorded when this walkthrough was written:
# [1] "CHOL" "LIHC" "DLBC" "BLCA" "ACC" "CESC" "PCPG" "PAAD" "MESO" "TGCT"
# [11] "KIRP" "UVM" "UCS" "THYM" "COAD" "ESCA" "GBM" "KICH" "HNSC" "PRAD"
# [21] "OV" "LUSC" "LAML" "LGG" "SARC" "BRCA" "READ" "LUAD" "STAD" "THCA"
# [31] "KIRC" "SKCM" "UCEC"

下载mRNA表达量数据

#' Download STAR-Counts mRNA expression for one TCGA project
#'
#' Queries GDC for the project's "Gene Expression Quantification" files,
#' downloads and prepares them, keeps the gene annotation columns plus the
#' count / TPM / FPKM columns, and writes the table to
#' `mRNA/TCGA_<cancer>_mRNA.arrow` with zstd compression.
#'
#' @param cancer Project code without the "TCGA-" prefix, e.g. "BRCA".
#' @return `NULL`, invisibly. Called for its side effect of writing a file.
downRNA <- function(cancer) {
  # `legacy = FALSE` is intentionally not passed: it equals the default where
  # the argument exists. NOTE(review): newer TCGAbiolinks releases appear to
  # have removed `legacy` altogether, in which case passing it fails with
  # "unused argument" -- confirm against the installed version.
  query <- TCGAbiolinks::GDCquery(
    project = paste0("TCGA-", cancer),
    data.category = "Transcriptome Profiling",
    data.type = "Gene Expression Quantification",
    workflow.type = "STAR - Counts"
  )
  TCGAbiolinks::GDCdownload(query, files.per.chunk = 50)
  data <- TCGAbiolinks::GDCprepare(query, summarizedExperiment = FALSE)

  # Keep only rows whose gene_id starts with "EN" (presumably this drops
  # non-gene summary rows emitted by the workflow -- verify on real data).
  data <- dplyr::filter(data, str_detect(gene_id, "^EN"))

  dt <- dplyr::select(
    data,
    gene_id, gene_name, gene_type,
    starts_with("unstranded"),
    starts_with("tpm"),
    starts_with("fpkm_unstranded")
  )

  # "tpm_unstranded_*" -> "tpm_*", "fpkm_unstranded_*" -> "fpkm_*",
  # then the remaining "unstranded_*" -> "count_*".
  colnames(dt) <- colnames(dt) %>%
    str_remove("_unstranded") %>%
    str_replace("unstranded", "count")

  # Make sure the target folder exists so the function works on its own.
  dir.create("mRNA", showWarnings = FALSE, recursive = TRUE)
  out_path <- file.path("mRNA", paste0("TCGA_", cancer, "_mRNA.arrow"))

  # The compression options belong to write_ipc_file(). Previously they sat
  # inside the str_glue() call that built the path, so they never reached
  # the writer.
  arrow::write_ipc_file(
    dt,
    out_path,
    compression = "zstd",
    compression_level = 1
  )

  invisible(NULL)
}

# Run the mRNA download once per project code; only side effects matter.
purrr::walk(.x = gdc_projects, .f = downRNA)

下载其他几种数据的函数

#' Download one GDC data type for one TCGA project and store it as Arrow IPC
#'
#' Builds a GDC query from the given filters, downloads and prepares the
#' matching files, and writes the result to
#' `<folder_name>/TCGA_<cancer>_<folder_name>.arrow` with zstd compression.
#'
#' @param cancer Project code without the "TCGA-" prefix, e.g. "BRCA".
#' @param folder_name Output directory; also used as the file-name suffix.
#' @param data_category,data_type,workflow_type,experimental_strategy
#'   Filters forwarded to `TCGAbiolinks::GDCquery()`. The `FALSE` defaults are
#'   forwarded unchanged, exactly as before.
#' @param legacy Forwarded to `GDCquery()` only when it is not `FALSE`.
#' @return Whatever `arrow::write_ipc_file()` returns.
download <- function(
    cancer,
    folder_name,
    data_category = FALSE,
    data_type = FALSE,
    workflow_type = FALSE,
    experimental_strategy = FALSE,
    legacy = FALSE) {
  query_args <- list(
    project = paste0("TCGA-", cancer),
    data.category = data_category,
    data.type = data_type,
    experimental.strategy = experimental_strategy,
    workflow.type = workflow_type
  )
  # Only forward `legacy` when the caller asked for it. NOTE(review): newer
  # TCGAbiolinks releases appear to have removed the argument, so always
  # passing `legacy = FALSE` would fail with "unused argument" there --
  # confirm against the installed version.
  if (!isFALSE(legacy)) {
    query_args$legacy <- legacy
  }
  query <- do.call(TCGAbiolinks::GDCquery, query_args)

  TCGAbiolinks::GDCdownload(query, files.per.chunk = 50)
  prepared <- TCGAbiolinks::GDCprepare(
    query = query,
    summarizedExperiment = FALSE
  )

  # Make sure the target folder exists so the function works on its own.
  dir.create(folder_name, showWarnings = FALSE, recursive = TRUE)
  out_path <- file.path(
    folder_name,
    paste0("TCGA_", cancer, "_", folder_name, ".arrow")
  )

  # The compression options belong to write_ipc_file(). Previously they sat
  # inside the str_glue() call that built the path, so they never reached
  # the writer.
  arrow::write_ipc_file(
    prepared,
    out_path,
    compression = "zstd",
    compression_level = 1
  )
}

下载microRNA表达量数据

# miRNA-Seq expression quantification, one download() call per project.
purrr::walk(gdc_projects, function(cancer) {
  download(
    cancer,
    folder_name = "miRNA",
    data_category = "Transcriptome Profiling",
    data_type = "miRNA Expression Quantification",
    experimental_strategy = "miRNA-Seq"
  )
})

 

下载SNV数据

# Masked somatic mutation calls, one download() call per project.
purrr::walk(gdc_projects, function(cancer) {
  download(
    cancer,
    folder_name = "SNV",
    data_category = "Simple Nucleotide Variation",
    data_type = "Masked Somatic Mutation"
  )
})

 

下载CNV数据

# Masked copy-number segments, one download() call per project.
purrr::walk(gdc_projects, function(cancer) {
  download(
    cancer,
    folder_name = "CNV",
    data_category = "Copy Number Variation",
    data_type = "Masked Copy Number Segment"
  )
})

 

下载蛋白表达量数据

# Protein expression quantification, one download() call per project.
purrr::walk(gdc_projects, function(cancer) {
  download(
    cancer,
    folder_name = "Protein",
    data_category = "Proteome Profiling",
    data_type = "Protein Expression Quantification"
  )
})

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值