安装和加载包
将转变为
创建工作目录,复制这两个文件进去。
#加载包和注释文件
library(tidyverse)#加载包
load("gene_annotation_2022.rda")#导入gene注释文件
#加载TCGA下载的rda文件
load("luad.gdc_2022.rda")#导入文件,rda格式文件也可直接从文件夹双击导入
#和counts基本一模一样
tpms <- expquery2@assays@data@listData[["tpm_unstrand"]]
colnames(tpms) <- expquery2@colData@rownames
rownames(tpms) <- expquery2@rowRanges@ranges@NAMES
tpms <- tpms %>%
as.data.frame() %>%
rownames_to_column("ENSEMBL") %>%
inner_join(gene_annotation_2022,"ENSEMBL") %>%
.[!duplicated(.$symbol),]
rownames(tpms) <- NULL
tpms <- tpms %>% column_to_rownames("symbol")
# 保留mRNA (注:可通过table(tpms$type)查看基因类型)
tpms <- tpms[tpms$type == "protein_coding",]
tpms <- tpms[,-c(1,ncol(tpms))]
# 把TCGA barcode切割为16位字符,并去除重复样本
colnames(tpms) <- substring(colnames(tpms),1,16)
tpms <- tpms[,!duplicated(colnames(tpms))]
# 保留01A (注:可通过table(substring(colnames(tpms),14,16))查看样本类型)
tpms01A <- tpms[,substring(colnames(tpms),14,16) == c("01A")]
# 保留11A
tpms11A <- tpms[,substring(colnames(tpms),14,16) == c("11A")]
#判断counts和tpms的行列名是否一致
identical(rownames(tpms01A),rownames(tpms11A))
#cbind和rbind 合并 col row
tpms <- cbind(tpms01A,tpms11A)
#保存数据
write.table(tpms01A,"tpms01A.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
write.table(tpms11A,"tpms11A.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
write.table(tpms,"tpms.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
Log2化
####tpms_log2####
range(tpms)#查看数据范围
range(tpms01A)
range(tpms11A)
#处理01A
tpms01A_log2 <- log2(tpms01A+1)#log2转换 为什么要加1
range(tpms01A_log2)
#处理11A
tpms11A_log2 <- log2(tpms11A+1)
range(tpms11A_log2)
#处理
tpms_log2 <- log2(tpms+1)
range(tpms_log2)
#保存log2转换后的数据
write.table(tpms_log2,"tpms_log2.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
write.table(tpms01A_log2,"tpms01A_log2.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
write.table(tpms11A_log2,"tpms11A_log2.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
#表达谱整理完毕
完整代码
####tpms####
library(tidyverse)#加载包
#和counts基本一模一样
tpms <- expquery2@assays@data@listData[["tpm_unstrand"]]
colnames(tpms) <- expquery2@colData@rownames
rownames(tpms) <- expquery2@rowRanges@ranges@NAMES
tpms <- tpms %>%
as.data.frame() %>%
rownames_to_column("ENSEMBL") %>%
inner_join(gene_annotation_2022,"ENSEMBL") %>%
.[!duplicated(.$symbol),]
rownames(tpms) <- NULL
tpms <- tpms %>% column_to_rownames("symbol")
# 保留mRNA (注:可通过table(tpms$type)查看基因类型)
tpms <- tpms[tpms$type == "protein_coding",]
tpms <- tpms[,-c(1,ncol(tpms))]
# 把TCGA barcode切割为16位字符,并去除重复样本
colnames(tpms) <- substring(colnames(tpms),1,16)
tpms <- tpms[,!duplicated(colnames(tpms))]
# 保留01A (注:可通过table(substring(colnames(tpms),14,16))查看样本类型)
tpms01A <- tpms[,substring(colnames(tpms),14,16) == c("01A")]
# 保留11A
tpms11A <- tpms[,substring(colnames(tpms),14,16) == c("11A")]
#判断counts和tpms的行列名是否一致
identical(rownames(tpms01A),rownames(tpms11A))
#cbind和rbind 合并 col row
tpms <- cbind(tpms01A,tpms11A)
#保存数据
write.table(tpms01A,"tpms01A.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
write.table(tpms11A,"tpms11A.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
write.table(tpms,"tpms.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
####tpms_log2####
range(tpms)#查看数据范围
range(tpms01A)
range(tpms11A)
#处理01A
tpms01A_log2 <- log2(tpms01A+1)#log2转换 为什么要加1
range(tpms01A_log2)
#处理11A
tpms11A_log2 <- log2(tpms11A+1)
range(tpms11A_log2)
#处理
tpms_log2 <- log2(tpms+1)
range(tpms_log2)
#保存log2转换后的数据
write.table(tpms_log2,"tpms_log2.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
write.table(tpms01A_log2,"tpms01A_log2.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
write.table(tpms11A_log2,"tpms11A_log2.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
#表达谱整理完毕