library(dplyr)
library(argparser)
library(tximport)
# parameter input
parser <- arg_parser("merge transcript abundance files") %>%
  add_argument("--sample", short = "-s", help = "the sample ID file") %>%
  add_argument("--dir", short = "-d", help = "the directory containing the quantification output") %>%
  add_argument("--gene", short = "-g", help = "the transcript ID to gene ID mapping file") %>%
  add_argument("--type", short = "-t", help = "the type of quantification software") %>%
  add_argument("--count", short = "-c", help = "the counts scaling method (countsFromAbundance)") %>%
  add_argument("--name", short = "-n", help = "the name of the per-sample transcript abundance file") %>%
  add_argument("--out", short = "-o", help = "the directory and name of the merged output file")
args <- parse_args(parser)
# prepare the function inputs
sample <- args$sample
dir <- args$dir
tx2gen <- args$gene
type <- args$type
scount <- args$count
name <- args$name
prefix <- args$out
tx2gene_table <- read.csv(tx2gen, header = TRUE)
tx2gene <- tx2gene_table %>% dplyr::select(tx_id, gene_id, gene_name)
phen <- read.table(sample, header = TRUE, sep = "\t")
sample_name <- unique(as.character(phen$SampleID))
files <- file.path(dir, sample_name, name)   # one abundance file per sample
names(files) <- sample_name
result <- tximport(files, type = type, countsFromAbundance = scount,
                   tx2gene = tx2gene, ignoreTxVersion = TRUE)
save(result, file = prefix)
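# Example invocation of the script above (a sketch; the script and file names are
# hypothetical placeholders):
# Rscript merge_abundance.R --sample sampleID.tsv --dir kallisto_quant \
#   --gene Ensembl_trans2gene.csv --type kallisto --count no \
#   --name abundance.h5 --out kallisto_tximport.RData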
# Load tximport and import the quantification results of the different samples.
library(tximport)
base_dir <- "Downloads/kallisto_quant"
base_dir <- "/share/home/jianglab/yuanmengyi/MI_RNA/kallisto_quant"
base_dir2<-"/share/home/jianglab/yuanmengyi/MI_RNA/"
sample_id <- dir(file.path(base_dir))                 # one sub-directory per sample
sample_id <- paste0(sample_id, "_ballgown_out_dir")
kal_dirs <- sapply(sample_id, function(id) file.path(base_dir, id, "abundance.h5"))        # kallisto abundance files
stringtie_dirs <- sapply(sample_id, function(id) file.path(base_dir2, id, "t_data.ctab"))  # StringTie t_data.ctab files
s2c <- read.table("Downloads/design_matrix.txt", header = TRUE, sep='\t',stringsAsFactors=FALSE)
s2c <- read.table("./design_matrix.txt", header = TRUE, sep='\t',stringsAsFactors=FALSE)
s2c <- dplyr::mutate(s2c, path = kal_dirs)
s2c <- dplyr::mutate(s2c, path = stringtie_dirs)
# t2g (the transcript-to-gene mapping) is built in the next block
kallisto_tsv <- tximport(kal_dirs, type = "kallisto", countsFromAbundance = "no",
                         tx2gene = t2g, ignoreTxVersion = TRUE)
stringtie_tsv <- tximport(stringtie_dirs, type = "stringtie", countsFromAbundance = "no",
                          tx2gene = t2g)
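# A quick look at the imported object (a sketch, assuming at least three samples):
# tximport() returns a list with the matrices "abundance", "counts" and "length",
# plus the "countsFromAbundance" setting.
names(kallisto_tsv)              # "abundance" "counts" "length" "countsFromAbundance"
dim(kallisto_tsv$counts)         # genes x samples after tx2gene summarisation
head(kallisto_tsv$counts[, 1:3])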
# Load the transcript-to-gene mapping (transcript2gene) from the Ensembl database;
# no internet connection is needed here because the file has already been downloaded.
t2g <- read_csv("Downloads/Ensembl_trans2gene.csv")
t2g <- read_csv("./Ensembl_trans2gene.csv")
t2g<- t2g[,-1]
# Alternatively, build the mapping online with biomaRt:
library(biomaRt)
mart <- biomaRt::useMart(biomart = "ENSEMBL_MART_ENSEMBL",
                         dataset = "mmusculus_gene_ensembl",
                         host = "ensembl.org")
t2g <- biomaRt::getBM(attributes = c("ensembl_transcript_id", "ensembl_gene_id",
                                     "external_gene_name"), mart = mart)
t2g <- dplyr::rename(t2g, target_id = ensembl_transcript_id,
                     ens_gene = ensembl_gene_id, ext_gene = external_gene_name)
t2g <- read_csv("Downloads/Ensembl_trans2gene.csv")
t2g<-t2g[,-1]
colnames(t2g) <- c("tx_id", "gene_id", "gene_name")   # the column names expected below
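# A quick check of the mapping (a sketch): in Ensembl each transcript belongs to a
# single gene, so duplicated transcript IDs usually indicate a malformed mapping file.
head(t2g)
sum(duplicated(t2g$tx_id))   # expected to be 0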
# Read in the phenotype information and adjust SampleID so that it matches the
# column names of the counts matrix one-to-one.
library(dplyr)
library(tibble)
result <- kallisto_tsv                           # use the kallisto result from here on
# phen <- read_csv("Downloads/sample_RNA.csv")   # alternative local path
phen <- read_csv("./sample_RNA.csv")
# data.frame() converts "-" to "." and prepends "X" to column names that start with
# a digit, so mirror that transformation in SampleID
phen$SampleID <- gsub("-", ".", phen$SampleID)
phen$SampleID <- paste0("X", phen$SampleID)
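# For example (a sketch; "123-A01" is a hypothetical sample ID), data.frame() runs
# make.names() on column names:
make.names("123-A01")   # "X123.A01", matching the gsub()/paste0() fix above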
# Extract the count and effective-length matrices.
kallisto_count <- round(kallisto_tsv$counts) %>% data.frame()
stringtie_count <- round(stringtie_tsv$counts) %>% data.frame()
kallisto_length <- apply(kallisto_tsv$length, 1, mean) %>% data.frame() %>% setNames("Length")    # mean effective length per gene across samples
stringtie_length <- apply(stringtie_tsv$length, 1, mean) %>% data.frame() %>% setNames("Length")
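# Sanity check (a sketch): the counts and length matrices returned by tximport share
# row names, so the two data frames should line up gene-for-gene.
stopifnot(identical(rownames(kallisto_count), rownames(kallisto_length)))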
# Here we step outside the function and assign its arguments interactively for debugging.
occurrence <- 0.2
ncount <- 10
y <- phen
# StringTie input:
# matrix <- stringtie_count; exon_length <- stringtie_length
# kallisto input:
matrix <- kallisto_count
exon_length <- kallisto_length
# Function 1: filter and normalize the expression matrix.
# - apply() computes the fraction of non-zero values in each row; rows with a fraction
#   above `occurrence` are kept.
# - rowSums() computes the row totals; rows with a total above `ncount` are kept.
# - intersect() finds the sample IDs shared by the expression matrix and the phenotype
#   data frame; both objects are then reordered according to these shared IDs.
# - The matrix is normalized to FPKM (fragments per kilobase per million mapped reads)
#   and TPM (transcripts per million).
# - The normalized matrices are sorted by row name and a list containing counts, FPKM
#   and TPM is returned (a small worked example follows the function definition below).
# Inputs: matrix (expression matrix), exon_length (exon/effective length information),
#   phen (phenotype data frame), occurrence and ncount (filtering thresholds).
# Output: a list containing counts, FPKM and TPM.
get_profile <- function(matrix, exon_length,
                        y = phen,
                        occurrence = 0.2,
                        ncount = 10){
  # filter rows by occurrence and ncount
  prf <- matrix %>% rownames_to_column("Type") %>%
    filter(apply(dplyr::select(., -one_of("Type")), 1,
                 function(x){ sum(x > 0) / length(x) }) > occurrence) %>%
    data.frame(.) %>%
    column_to_rownames("Type")
  colnames(prf) <- gsub("_ballgown_out_dir", "", colnames(prf))
  prf <- prf[rowSums(prf) > ncount, ]
  # match the sample IDs between the profile and the phenotype table
  sid <- intersect(colnames(prf), y$SampleID)
  phen.cln <- y %>% filter(SampleID %in% sid) %>% arrange(SampleID)
  prf.cln <- prf %>% dplyr::select(as.character(phen.cln$SampleID))
  # determine the right order between profile and phenotype
  for(i in 1:ncol(prf.cln)){
    if(!(colnames(prf.cln)[i] == phen.cln$SampleID[i])){
      stop(paste0(i, " Wrong"))
    }
  }
  colnames(prf.cln) <- phen.cln$SampleID
  # normalize the profile to FPKM and TPM
  exon_length.cln <- exon_length[as.character(rownames(prf.cln)), , drop = FALSE]
  for (i in 1:nrow(exon_length.cln)) {
    if (rownames(exon_length.cln)[i] != rownames(prf.cln)[i]) {
      stop(paste0(i, " Wrong"))
    }
  }
  dat <- prf.cln / exon_length.cln$Length            # counts per base of gene length
  dat_FPKM <- t(t(dat) / colSums(prf.cln)) * 10^9    # scale by per-sample total counts; 10^9 = kb x million
  dat_TPM <- t(t(dat) / colSums(dat)) * 10^6         # scale by per-sample total rate; columns sum to 10^6
  dat_count <- prf.cln[order(rownames(prf.cln)), ]
  dat_fpkm <- dat_FPKM[order(rownames(dat_FPKM)), ]
  dat_tpm <- dat_TPM[order(rownames(dat_TPM)), ]
  res <- list(count = dat_count, fpkm = dat_fpkm, tpm = dat_tpm)
  return(res)
}
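# A small worked example of the FPKM/TPM arithmetic used above (a sketch with made-up
# numbers; toy_counts and toy_length are hypothetical, two genes by two samples).
toy_counts <- data.frame(S1 = c(100, 300), S2 = c(200, 600),
                         row.names = c("geneA", "geneB"))
toy_length <- c(geneA = 1000, geneB = 2000)             # gene lengths in bp
toy_rate <- toy_counts / toy_length                     # counts per base
toy_fpkm <- t(t(toy_rate) / colSums(toy_counts)) * 10^9
toy_tpm <- t(t(toy_rate) / colSums(toy_rate)) * 10^6
colSums(toy_tpm)                                        # each column sums to 1e6 by construction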
# Function 2: write the results of the previous function to files.
# Interactive debugging values for the arguments used below:
# result <- res; dir <- base_dir2; type <- "stringtie"
prf_out <- function(result, type = "STAR"){
  dir <- "../../Result/Profile/"
  if(!dir.exists(dir)){
    dir.create(dir, recursive = TRUE)
  }
  count_name <- paste0(dir, type, "_filtered_counts.tsv")
  FPKM_name <- paste0(dir, type, "_filtered_FPKM.tsv")
  TPM_name <- paste0(dir, type, "_filtered_TPM.tsv")
  write.table(x = result$count,
              file = count_name,
              sep = "\t",
              quote = FALSE,
              col.names = NA)
  write.table(x = result$fpkm,
              file = FPKM_name,
              sep = "\t",
              quote = FALSE,
              col.names = NA)
  write.table(x = result$tpm,
              file = TPM_name,
              sep = "\t",
              quote = FALSE,
              col.names = NA)
}
kallisto_prf <- get_profile(kallisto_count, kallisto_length)   # filter and normalize
prf_out(kallisto_prf, type = "kallisto")                       # write the output tables
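# The same two steps for the StringTie quantification (a sketch; it reuses the
# stringtie_count and stringtie_length objects built above).
stringtie_prf <- get_profile(stringtie_count, stringtie_length)
prf_out(stringtie_prf, type = "stringtie")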