R语言:清洗TCGA数据(标准化,细节看非标准化那篇)

安装和加载包

转变为

 创建工作目录,复制这两个文件进去。

#加载包和注释文件
library(tidyverse)#加载包
load("gene_annotation_2022.rda")#导入gene注释文件

#加载TCGA下载的rda文件
load("luad.gdc_2022.rda")#导入文件,rda格式文件也可直接从文件夹双击导入


#和counts基本一模一样
tpms <- expquery2@assays@data@listData[["tpm_unstrand"]]
colnames(tpms) <- expquery2@colData@rownames
rownames(tpms) <- expquery2@rowRanges@ranges@NAMES

tpms <- tpms %>% 
  as.data.frame() %>% 
  rownames_to_column("ENSEMBL") %>% 
  inner_join(gene_annotation_2022,"ENSEMBL") %>% 
  .[!duplicated(.$symbol),]

rownames(tpms) <- NULL
tpms <- tpms %>% column_to_rownames("symbol") 

# 保留mRNA (注:可通过table(tpms$type)查看基因类型)
tpms <- tpms[tpms$type == "protein_coding",]
tpms <- tpms[,-c(1,ncol(tpms))]

# 把TCGA barcode切割为16位字符,并去除重复样本
colnames(tpms) <- substring(colnames(tpms),1,16)
tpms <- tpms[,!duplicated(colnames(tpms))]

# 保留01A  (注:可通过table(substring(colnames(tpms),14,16))查看样本类型)
tpms01A <- tpms[,substring(colnames(tpms),14,16) == c("01A")]

# 保留11A
tpms11A <- tpms[,substring(colnames(tpms),14,16) == c("11A")]


#判断counts和tpms的行列名是否一致
identical(rownames(tpms01A),rownames(tpms11A))
#cbind和rbind 合并 col row
tpms <- cbind(tpms01A,tpms11A)

#保存数据
write.table(tpms01A,"tpms01A.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
write.table(tpms11A,"tpms11A.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
write.table(tpms,"tpms.txt",sep = "\t",row.names = T,col.names = NA,quote = F)

Log2化

####tpms_log2####
range(tpms)#查看数据范围
range(tpms01A)
range(tpms11A)

#处理01A
tpms01A_log2 <- log2(tpms01A+1)#log2转换 为什么要加1
range(tpms01A_log2)
#处理11A
tpms11A_log2 <- log2(tpms11A+1)
range(tpms11A_log2)
#处理
tpms_log2 <- log2(tpms+1)
range(tpms_log2)

#保存log2转换后的数据
write.table(tpms_log2,"tpms_log2.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
write.table(tpms01A_log2,"tpms01A_log2.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
write.table(tpms11A_log2,"tpms11A_log2.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
#表达谱整理完毕

完整代码

####tpms####
library(tidyverse)#加载包
#和counts基本一模一样
tpms <- expquery2@assays@data@listData[["tpm_unstrand"]]
colnames(tpms) <- expquery2@colData@rownames
rownames(tpms) <- expquery2@rowRanges@ranges@NAMES

tpms <- tpms %>% 
  as.data.frame() %>% 
  rownames_to_column("ENSEMBL") %>% 
  inner_join(gene_annotation_2022,"ENSEMBL") %>% 
  .[!duplicated(.$symbol),]

rownames(tpms) <- NULL
tpms <- tpms %>% column_to_rownames("symbol") 

# 保留mRNA (注:可通过table(tpms$type)查看基因类型)
tpms <- tpms[tpms$type == "protein_coding",]
tpms <- tpms[,-c(1,ncol(tpms))]

# 把TCGA barcode切割为16位字符,并去除重复样本
colnames(tpms) <- substring(colnames(tpms),1,16)
tpms <- tpms[,!duplicated(colnames(tpms))]

# 保留01A  (注:可通过table(substring(colnames(tpms),14,16))查看样本类型)
tpms01A <- tpms[,substring(colnames(tpms),14,16) == c("01A")]

# 保留11A
tpms11A <- tpms[,substring(colnames(tpms),14,16) == c("11A")]


#判断counts和tpms的行列名是否一致
identical(rownames(tpms01A),rownames(tpms11A))
#cbind和rbind 合并 col row
tpms <- cbind(tpms01A,tpms11A)

#保存数据
write.table(tpms01A,"tpms01A.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
write.table(tpms11A,"tpms11A.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
write.table(tpms,"tpms.txt",sep = "\t",row.names = T,col.names = NA,quote = F)




####tpms_log2####
range(tpms)#查看数据范围
range(tpms01A)
range(tpms11A)

#处理01A
tpms01A_log2 <- log2(tpms01A+1)#log2转换 为什么要加1
range(tpms01A_log2)
#处理11A
tpms11A_log2 <- log2(tpms11A+1)
range(tpms11A_log2)
#处理
tpms_log2 <- log2(tpms+1)
range(tpms_log2)

#保存log2转换后的数据
write.table(tpms_log2,"tpms_log2.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
write.table(tpms01A_log2,"tpms01A_log2.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
write.table(tpms11A_log2,"tpms11A_log2.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
#表达谱整理完毕

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值