TCGAbiolinks整理表达数据和临床数据

sayhello1025

已于 2023-07-27 11:21:21 修改

阅读量8.3k

点赞数 26

分类专栏： TCGA 文章标签： r语言开发语言

于 2022-06-10 11:37:14 首次发布

本文链接：https://blog.csdn.net/sayhello1025/article/details/125218337

版权

TCGA 专栏收录该内容

8 篇文章

订阅专栏

新版TCGAbiolinks的整理表达数据和临床数据

没有废话直接干

## Load packages
# Start from a clean workspace: removes every (non-hidden) object in the
# current environment, so run this only in a dedicated session.
rm(list = ls())
# Spell out FALSE: `F` is an ordinary variable that can be reassigned.
options(stringsAsFactors = FALSE)
gc()
library(TCGAbiolinks)
library(scRNAseq)
library(data.table)
library(limma)
library(dplyr)
library(DT)

表达数据下载，跟GDC官网一样的参数

# Open the GDCquery help page; per the surrounding text, the arguments
# correspond to the filters offered on the GDC website.
?GDCquery
### Download the expression data
query <- GDCquery(
  project       = "TCGA-ESCA",
  data.category = "Transcriptome Profiling",
  data.type     = "Gene Expression Quantification",
  workflow.type = "STAR - Counts"
)
GDCdownload(query = query)
# Assemble the downloaded files into one object and also save it to
# ESCA_exp.rda.
expData <- GDCprepare(
  query         = query,
  save          = TRUE,
  save.filename = "ESCA_exp.rda"
)
# Extract the "tpm_unstrand" assay as a matrix; expData carries several
# assays, so pick the one you need by name.
# NOTE(review): assay() is presumably provided by SummarizedExperiment, which
# library(scRNAseq) attaches as a dependency -- confirm.
tpm_data <- assay(expData, i = "tpm_unstrand")

在这里插入图片描述

我们得到标准化的TPM矩阵

在这里插入图片描述

行名转换

下载完成后，工作目录下会生成一个名为 GDCdata 的文件夹；从中任选一个 tsv 格式的表达文件即可，它只用来获取基因 ID 与基因名的对应关系。

# Pick any one of the downloaded per-sample expression files. It is used only
# as a lookup table: column 1 supplies the row IDs shared with tpm_data and
# column 2 supplies the names used as final row names.
row_file <- data.table::fread(
  './GDCdata/TCGA-ESCA/harmonized/Transcriptome_Profiling/Gene_Expression_Quantification/00373cba-948b-4fb5-a8ea-8aa612f4ea99/625933e1-f9c9-43d9-8335-9de0c7ecb367.rna_seq.augmented_star_gene_counts.tsv',
  data.table = FALSE
)
# Drop the first four rows (presumably the STAR summary rows, not genes --
# confirm against the file) and index the table by its first column.
row_file <- row_file[-c(1:4), ]
rownames(row_file) <- row_file[, 1]
### Keep only the IDs present in both the TPM matrix and the lookup table
same <- intersect(rownames(tpm_data), rownames(row_file))
length(same)
ESCA_tpmExp <- cbind(row_file[same, ], tpm_data[same, ])
# Keep column 2 (the name column) plus the sample columns; this removes
# column 1 and columns 3-9 of the lookup table.
ESCA_tpmExp <- ESCA_tpmExp[, -c(1, 3:9)]
dim(ESCA_tpmExp)
# 60660   175 ## 175 because the name column is still included
## Collapse duplicated names
# Build the numeric matrix straight from the sample columns instead of going
# through a character matrix and back, which rounds the values.
tpm_matrix <- as.matrix(ESCA_tpmExp[, -1])
storage.mode(tpm_matrix) <- "double"
rownames(tpm_matrix) <- ESCA_tpmExp[, 1]
# Average the rows that share the same row name.
tpm_matrix <- avereps(tpm_matrix)
# Discard rows whose mean across samples is not positive.
ESCA_exp <- tpm_matrix[rowMeans(tpm_matrix) > 0, ]
dim(ESCA_exp)
# 56911   174
# Prepend the sample names as an explicit "id" header row, then write the
# table without the automatic column header.
Out <- rbind(id = colnames(ESCA_exp), ESCA_exp)
# The output folder is not created anywhere else in the script.
dir.create("./00.data", showWarnings = FALSE, recursive = TRUE)
write.table(
  Out,
  file = "./00.data/ESCA_exp.txt",
  sep = "\t",
  quote = FALSE,
  col.names = FALSE
)

接下来下载临床数据

解释一下 GDCprepare_clinic 的第二个参数：它决定要从临床文件中提取哪一类信息，需要按自己的需求选择；这里选的是 'follow_up'（随访信息）。

# Parse the downloaded clinical files; the second argument selects which
# clinical table to extract (here the follow-up records).
cli <- GDCprepare_clinic(query, clinical.info = "follow_up")

在这里插入图片描述

# Start the clinical part from a clean workspace: this removes every
# (non-hidden) object created so far, including the expression objects.
rm(list = ls())
# Spell out FALSE: `F` is an ordinary variable that can be reassigned.
options(stringsAsFactors = FALSE)
gc()
### Download the clinical data
query <- GDCquery(
  project = "TCGA-ESCA",
  data.category = "Clinical",
  data.type = "Clinical Supplement",
  data.format = "BCR XML"
)
GDCdownload(query)
# Extract the follow-up table from the downloaded clinical files.
cli <- GDCprepare_clinic(query, "follow_up")

接着把生存时间给合并下

有 2 个人没有生存状态信息（vital_status 为 NA）；下面只筛选 Dead 和 Alive，所以这 2 个人不会进入最终的生存数据。

# Build the survival table (Barcode, fustat, futime) from the follow-up data.
# Keep only the columns needed and one row per follow-up barcode.
# dplyr verbs are namespace-qualified because other attached packages may
# mask select()/rename().
cli <- cli %>%
  dplyr::select(
    bcr_followup_barcode, vital_status,
    days_to_death, days_to_last_followup
  ) %>%
  dplyr::distinct(bcr_followup_barcode, .keep_all = TRUE)
table(cli$vital_status)
# NA  Alive  Dead
#  2   121    32
## Deceased patients: survival time is days_to_death.
# dplyr::rename() takes new_name = old_name. The previous
# rename(c(old = 'new')) form is plyr::rename() syntax and fails under dplyr
# because no column called "Barcode" exists yet.
dead_patient <- cli %>%
  dplyr::filter(vital_status == "Dead") %>%
  dplyr::select(-days_to_last_followup) %>%
  dplyr::rename(
    Barcode = bcr_followup_barcode,
    fustat = vital_status,
    futime = days_to_death
  ) %>%
  # Encode the event indicator: 1 = dead, 0 = otherwise.
  dplyr::mutate(fustat = ifelse(fustat == "Dead", 1, 0)) %>%
  # Convert days to years.
  dplyr::mutate(futime = futime / 365)
## Living patients: survival time is days_to_last_followup.
alive_patient <- cli %>%
  dplyr::filter(vital_status == "Alive") %>%
  dplyr::select(-days_to_death) %>%
  dplyr::rename(
    Barcode = bcr_followup_barcode,
    fustat = vital_status,
    futime = days_to_last_followup
  ) %>%
  dplyr::mutate(fustat = ifelse(fustat == "Dead", 1, 0)) %>%
  dplyr::mutate(futime = futime / 365)
## Combine both groups; rows with a missing vital_status are in neither.
survival_data <- rbind(dead_patient, alive_patient)
# The workspace was wiped before this part, so make sure the folder exists.
dir.create("./00.data", showWarnings = FALSE, recursive = TRUE)
write.csv(survival_data, file = "./00.data/ESCA_surviv.csv")

接着把生存时间跟表达数据合并下

# Step 1: read the expression matrix (replace `yourfile` with its path).
# The argument separator must be an ASCII comma; the full-width comma used
# before was a syntax error.
exp <- read.csv(yourfile, header = TRUE, row.names = 1)
# Transpose so that each row is a sample and each column a gene.
exp <- data.frame(t(exp))
# Turn the dots in the sample names back into hyphens (presumably introduced
# when read.csv() sanitised the column names -- confirm on your data).
rownames(exp) <- gsub(".", "-", rownames(exp), fixed = TRUE)
# Read the clinical/survival table (replace `yourcli` with its path).
cli <- read.csv(yourcli, header = TRUE, row.names = 1)
# Join key: the first 12 characters of the row names of each table.
# NOTE(review): this assumes the row names of `cli` are barcodes; if the file
# was written by write.csv() with default row names they are 1..n and the
# key should come from the Barcode column instead -- confirm.
# NOTE(review): a gene column already named ID1 or ID2 in `exp` would be
# overwritten or duplicated by these key columns -- confirm on your data.
exp$ID1 <- substr(rownames(exp), 1, 12)
cli$ID2 <- substr(rownames(cli), 1, 12)
# Name the key columns in full. `exp$ID` / `cli$ID` depended on partial
# matching of `$`, which returns NULL when several columns start with "ID".
# Order matters: the table whose rows you want to keep goes first.
map <- match(exp$ID1, cli$ID2)
# Clinical rows reordered to line up with the rows of `exp` (NA rows where a
# sample has no clinical record).
data3 <- cli[map, ]
data4 <- cbind(exp, data3)
# Drop every sample that has a missing value in any column.
data5 <- na.omit(data4)
# Move the survival and key columns to the front.
desired_columns <- c("fustat", "futime", "ID1", "ID2")
all_columns <- c(desired_columns, setdiff(names(data5), desired_columns))
data6 <- data5[all_columns]