一、下载临床数据(以LIHC为例)
GDC官网:https://portal.gdc.cancer.gov/
新版TCGA数据库下载流程:
Cohort Builder → 选择 Program(TCGA)、Project(LIHC)→ 点击 Repository → 选择 clinical,全部加入 Cart → 下载 Clinical: TSV 和 Metadata 文件。
二、临床数据整理
数据整理和清洗的代码如下:
# Start from an empty workspace.
rm(list = ls())
# Unpack the downloaded clinical archive before anything is read from it.
tar_file <- "D:/R/TCGA-LIHC/clinical.cart.2024-08-21.tar.gz" # downloaded tar.gz archive
extract_dir <- "D:/R/TCGA-LIHC/clinical_data" # folder that receives the extracted files
untar(tarfile = tar_file, exdir = extract_dir)
# Build the entity_submitter_id <-> case_id lookup from the GDC metadata file.
# All relative file names below are resolved against this working directory.
setwd("D:/R/TCGA-LIHC/clinical_data")
library(readr)
library(dplyr)
# The downloaded metadata.json must be copied into this folder beforehand.
# install.packages("jsonlite")
# library(jsonlite)
json <- jsonlite::fromJSON("metadata.cart.2024-08-21.json")

# Extract the identifiers by column NAME rather than by position, so a change
# in the key order of the JSON cannot silently swap in the wrong field.
# lapply() + unlist() always yields a flat character vector, even when a
# metadata record lists more than one associated entity.
entity_submitter_id <- unlist(
  lapply(
    json$associated_entities,
    function(x) as.character(x[["entity_submitter_id"]])
  ),
  use.names = FALSE
)
case_id <- unlist(
  lapply(
    json$associated_entities,
    function(x) as.character(x[["case_id"]])
  ),
  use.names = FALSE
)
# Both vectors must pair up one-to-one; fail loudly if a field is missing.
stopifnot(length(entity_submitter_id) == length(case_id))

# Two-column character matrix with columns entity_submitter_id and case_id.
sample_case <- cbind(
  entity_submitter_id = entity_submitter_id,
  case_id = case_id
)
# Load the clinical table and keep only the first row seen for each case_id.
clinical <- read_tsv("clinical.tsv") %>%
  filter(!duplicated(case_id)) %>%
  as.data.frame()

# Print the structure of both inputs before they are joined.
str(sample_case)
str(clinical)

sample_case <- as.data.frame(sample_case)
# Make sure the join key is a plain character vector on both sides.
sample_case[["case_id"]] <- as.character(sample_case[["case_id"]])
clinical[["case_id"]] <- as.character(clinical[["case_id"]])

# Left join: every row of sample_case is kept, clinical columns are attached.
matrix <- merge(
  x = sample_case,
  y = clinical,
  by = "case_id",
  all.x = TRUE
)
# Show the column names available in the clinical table.
colnames(clinical)

# Clinical fields carried forward into the analysis table.
demo <- c(
  "case_submitter_id", "age_at_index", "ethnicity", "gender", "race",
  "vital_status", "days_to_death", "days_to_last_follow_up",
  "ajcc_pathologic_stage", "ajcc_pathologic_t", "ajcc_pathologic_m",
  "ajcc_pathologic_n", "treatment_type"
)
matrix <- matrix[, demo]
head(matrix)

# Shorter column labels, in the same order as `demo`.
names(matrix) <- c(
  "ID", "Age", "Ethnicity", "Gender", "Race",
  "Status", "days_to_death", "days_to_last_follow_up",
  "Stage", "T", "M", "N", "Treatment"
)

# Keep only rows whose outcome is Alive or Dead (drops e.g. "Not Reported").
matrix <- subset(matrix, Status %in% c("Alive", "Dead"))
# Derive the survival columns (days, OS, month, OS.time) from the raw fields.

# Coerce the time and age columns to numeric. Values that are not parseable
# as numbers become NA (R emits a coercion warning when that happens).
matrix$days_to_last_follow_up <- as.numeric(matrix$days_to_last_follow_up)
matrix$days_to_death <- as.numeric(matrix$days_to_death)
matrix$Age <- as.numeric(matrix$Age)

# Missing values are intentionally left as NA instead of being replaced by 0:
# a 0 would be indistinguishable from a real measurement, i.e. it would create
# patients aged 0 and events/censoring at time 0. NA propagates through the
# derived columns below, so incomplete rows can be excluded explicitly later.

# Follow-up time in days: last follow-up for living patients, time to death
# for deceased ones.
matrix$days <- ifelse(
  matrix$Status == "Alive",
  matrix$days_to_last_follow_up,
  matrix$days_to_death
)

# Event indicator: 0 = Alive (censored), 1 = Dead (event).
matrix$OS <- ifelse(matrix$Status == "Alive", 0, 1)
# Follow-up in months (30-day months), rounded to a whole number.
matrix$month <- round(matrix$days / 30, 0)
# Follow-up in completed years.
# NOTE(review): flooring to whole years discards sub-year resolution; confirm
# this is the intended time scale for the downstream survival analysis.
matrix$OS.time <- floor(matrix$month / 12)
运行代码,查看matrix存储的临床数据信息:
后续可以进一步进行生存分析及可视化等操作。