R语言：清洗TCGA数据（标准化，细节看非标准化那篇）

print_icecream

已于 2025-02-23 07:10:06 修改

阅读量245

点赞数 1

文章标签： r语言开发语言

于 2025-02-22 22:05:15 首次发布

本文链接：https://blog.csdn.net/print_icecream/article/details/145801683

版权

安装和加载包

将转变为

创建工作目录，复制这两个文件进去。

#加载包和注释文件
library(tidyverse)#加载包
load("gene_annotation_2022.rda")#导入gene注释文件

#加载TCGA下载的rda文件
load("luad.gdc_2022.rda")#导入文件，rda格式文件也可直接从文件夹双击导入


#和counts基本一模一样
tpms <- expquery2@assays@data@listData[["tpm_unstrand"]]
colnames(tpms) <- expquery2@colData@rownames
rownames(tpms) <- expquery2@rowRanges@ranges@NAMES

tpms <- tpms %>% 
  as.data.frame() %>% 
  rownames_to_column("ENSEMBL") %>% 
  inner_join(gene_annotation_2022,"ENSEMBL") %>% 
  .[!duplicated(.$symbol),]

rownames(tpms) <- NULL
tpms <- tpms %>% column_to_rownames("symbol") 

# 保留mRNA （注：可通过table(tpms$type)查看基因类型）
tpms <- tpms[tpms$type == "protein_coding",]
tpms <- tpms[,-c(1,ncol(tpms))]

# 把TCGA barcode切割为16位字符,并去除重复样本
colnames(tpms) <- substring(colnames(tpms),1,16)
tpms <- tpms[,!duplicated(colnames(tpms))]

# 保留01A  （注：可通过table(substring(colnames(tpms),14,16))查看样本类型）
tpms01A <- tpms[,substring(colnames(tpms),14,16) == c("01A")]

# 保留11A
tpms11A <- tpms[,substring(colnames(tpms),14,16) == c("11A")]


#判断counts和tpms的行列名是否一致
identical(rownames(tpms01A),rownames(tpms11A))
#cbind和rbind 合并 col row
tpms <- cbind(tpms01A,tpms11A)

#保存数据
write.table(tpms01A,"tpms01A.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
write.table(tpms11A,"tpms11A.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
write.table(tpms,"tpms.txt",sep = "\t",row.names = T,col.names = NA,quote = F)

Log2化

####tpms_log2####
range(tpms)#查看数据范围
range(tpms01A)
range(tpms11A)

#处理01A
tpms01A_log2 <- log2(tpms01A+1)#log2转换 为什么要加1
range(tpms01A_log2)
#处理11A
tpms11A_log2 <- log2(tpms11A+1)
range(tpms11A_log2)
#处理
tpms_log2 <- log2(tpms+1)
range(tpms_log2)

#保存log2转换后的数据
write.table(tpms_log2,"tpms_log2.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
write.table(tpms01A_log2,"tpms01A_log2.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
write.table(tpms11A_log2,"tpms11A_log2.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
#表达谱整理完毕

完整代码

####tpms####
library(tidyverse)#加载包
#和counts基本一模一样
tpms <- expquery2@assays@data@listData[["tpm_unstrand"]]
colnames(tpms) <- expquery2@colData@rownames
rownames(tpms) <- expquery2@rowRanges@ranges@NAMES

tpms <- tpms %>% 
  as.data.frame() %>% 
  rownames_to_column("ENSEMBL") %>% 
  inner_join(gene_annotation_2022,"ENSEMBL") %>% 
  .[!duplicated(.$symbol),]

rownames(tpms) <- NULL
tpms <- tpms %>% column_to_rownames("symbol") 

# 保留mRNA （注：可通过table(tpms$type)查看基因类型）
tpms <- tpms[tpms$type == "protein_coding",]
tpms <- tpms[,-c(1,ncol(tpms))]

# 把TCGA barcode切割为16位字符,并去除重复样本
colnames(tpms) <- substring(colnames(tpms),1,16)
tpms <- tpms[,!duplicated(colnames(tpms))]

# 保留01A  （注：可通过table(substring(colnames(tpms),14,16))查看样本类型）
tpms01A <- tpms[,substring(colnames(tpms),14,16) == c("01A")]

# 保留11A
tpms11A <- tpms[,substring(colnames(tpms),14,16) == c("11A")]


#判断counts和tpms的行列名是否一致
identical(rownames(tpms01A),rownames(tpms11A))
#cbind和rbind 合并 col row
tpms <- cbind(tpms01A,tpms11A)

#保存数据
write.table(tpms01A,"tpms01A.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
write.table(tpms11A,"tpms11A.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
write.table(tpms,"tpms.txt",sep = "\t",row.names = T,col.names = NA,quote = F)




####tpms_log2####
range(tpms)#查看数据范围
range(tpms01A)
range(tpms11A)

#处理01A
tpms01A_log2 <- log2(tpms01A+1)#log2转换 为什么要加1
range(tpms01A_log2)
#处理11A
tpms11A_log2 <- log2(tpms11A+1)
range(tpms11A_log2)
#处理
tpms_log2 <- log2(tpms+1)
range(tpms_log2)

#保存log2转换后的数据
write.table(tpms_log2,"tpms_log2.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
write.table(tpms01A_log2,"tpms01A_log2.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
write.table(tpms11A_log2,"tpms11A_log2.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
#表达谱整理完毕