library(dplyr)
library(argparser)
library(tximport)
# parameter input
parser <- arg_parser("merge transcript abundance files") %>%
  add_argument("--sample", short = "-s", help = "the sample ID file") %>%
  add_argument("--dir", short = "-d", help = "the directory containing the quantification output") %>%
  add_argument("--gene", short = "-g", help = "the transcript ID to gene ID mapping file") %>%
  add_argument("--type", short = "-t", help = "the type of quantification software") %>%
  add_argument("--count", short = "-c", help = "the counts scaling method (countsFromAbundance)") %>%
  add_argument("--name", short = "-n", help = "the name of the per-sample transcript abundance file") %>%
  add_argument("--out", short = "-o", help = "the directory and name of the merged output file")
args <- parse_args(parser)
# prepare the function inputs
sample <- args$sample
dir <- args$dir
tx2gen <- args$gene
type <- args$type
scount <- args$count
name <- args$name
prefix <- args$out
tx2gene_table <- read.csv(tx2gen, header = TRUE)
tx2gene <- tx2gene_table %>% dplyr::select(tx_id, gene_id, gene_name)
phen <- read.table(sample, header = TRUE, sep = "\t")
sample_name <- unique(as.character(phen$SampleID))
files <- file.path(dir, sample_name, name)   # one abundance file per sample
names(files) <- sample_name
result <- tximport(files, type = type, countsFromAbundance = scount,
                   tx2gene = tx2gene, ignoreTxVersion = TRUE)
save(result, file = prefix)
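# Example invocation of the script above (a sketch; the script and file names are
# hypothetical placeholders):
# Rscript merge_abundance.R --sample sampleID.tsv --dir kallisto_quant \
#   --gene Ensembl_trans2gene.csv --type kallisto --count no \
#   --name abundance.h5 --out kallisto_tximport.RData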
# Load tximport and import the quantification results of the different samples.
library(tximport)
base_dir <- "Downloads/kallisto_quant"
base_dir <- "/share/home/jianglab/yuanmengyi/MI_RNA/kallisto_quant"
base_dir2<-"/share/home/jianglab/yuanmengyi/MI_RNA/"
sample_id <- dir(file.path(base_dir))                 # one sub-directory per sample
sample_id <- paste0(sample_id, "_ballgown_out_dir")
kal_dirs <- sapply(sample_id, function(id) file.path(base_dir, id, "abundance.h5"))        # kallisto abundance files
stringtie_dirs <- sapply(sample_id, function(id) file.path(base_dir2, id, "t_data.ctab"))  # StringTie t_data.ctab files
s2c <- read.table("Downloads/design_matrix.txt", header = TRUE, sep='\t',stringsAsFactors=FALSE)
s2c <- read.table("./design_matrix.txt", header = TRUE, sep='\t',stringsAsFactors=FALSE)
s2c <- dplyr::mutate(s2c, path = kal_dirs)
s2c <- dplyr::mutate(s2c, path = stringtie_dirs)
# t2g (the transcript-to-gene mapping) is built in the next block
kallisto_tsv <- tximport(kal_dirs, type = "kallisto", countsFromAbundance = "no",
                         tx2gene = t2g, ignoreTxVersion = TRUE)
stringtie_tsv <- tximport(stringtie_dirs, type = "stringtie", countsFromAbundance = "no",
                          tx2gene = t2g)
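# A quick look at the imported object (a sketch, assuming at least three samples):
# tximport() returns a list with the matrices "abundance", "counts" and "length",
# plus the "countsFromAbundance" setting.
names(kallisto_tsv)              # "abundance" "counts" "length" "countsFromAbundance"
dim(kallisto_tsv$counts)         # genes x samples after tx2gene summarisation
head(kallisto_tsv$counts[, 1:3])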
# Load the transcript-to-gene mapping (transcript2gene) from the Ensembl database;
# no internet connection is needed here because the file has already been downloaded.
t2g <- read_csv("Downloads/Ensembl_trans2gene.csv")
t2g <- read_csv("./Ensembl_trans2gene.csv")
t2g<- t2g[,-1]
# Alternatively, build the mapping online with biomaRt:
library(biomaRt)
mart <- biomaRt::useMart(biomart = "ENSEMBL_MART_ENSEMBL",
                         dataset = "mmusculus_gene_ensembl",
                         host = "ensembl.org")
t2g <- biomaRt::getBM(attributes = c("ensembl_transcript_id", "ensembl_gene_id",
                                     "external_gene_name"), mart = mart)
t2g <- dplyr::rename(t2g, target_id = ensembl_transcript_id,
                     ens_gene = ensembl_gene_id, ext_gene = external_gene_name)
t2g <- read_csv("Downloads/Ensembl_trans2gene.csv")
t2g<-t2g[,-1]
colnames(t2g) <- c("tx_id", "gene_id", "gene_name")   # the column names expected below
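# A quick check of the mapping (a sketch): in Ensembl each transcript belongs to a
# single gene, so duplicated transcript IDs usually indicate a malformed mapping file.
head(t2g)
sum(duplicated(t2g$tx_id))   # expected to be 0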
# Read in the phenotype information and adjust SampleID so that it matches the
# column names of the counts matrix one-to-one.
library(dplyr)
library(tibble)
result <- kallisto_tsv                           # use the kallisto result from here on
# phen <- read_csv("Downloads/sample_RNA.csv")   # alternative local path
phen <- read_csv("./sample_RNA.csv")
# data.frame() converts "-" to "." and prepends "X" to column names that start with
# a digit, so mirror that transformation in SampleID
phen$SampleID <- gsub("-", ".", phen$SampleID)
phen$SampleID <- paste0("X", phen$SampleID)
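# For example (a sketch; "123-A01" is a hypothetical sample ID), data.frame() runs
# make.names() on column names:
make.names("123-A01")   # "X123.A01", matching the gsub()/paste0() fix above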
# Extract the count and effective-length matrices.
kallisto_count <- round(kallisto_tsv$counts) %>% data.frame()
stringtie_count <- round(stringtie_tsv$counts) %>% data.frame()
kallisto_length <- apply(kallisto_tsv$length, 1, mean) %>% data.frame() %>% setNames("Length")    # mean effective length per gene across samples
stringtie_length <- apply(stringtie_tsv$length, 1, mean) %>% data.frame() %>% setNames("Length")
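# Sanity check (a sketch): the counts and length matrices returned by tximport share
# row names, so the two data frames should line up gene-for-gene.
stopifnot(identical(rownames(kallisto_count), rownames(kallisto_length)))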
# Here we step outside the function and assign its arguments interactively for debugging.
occurrence <- 0.2
ncount <- 10
y <- phen
# StringTie input:
# matrix <- stringtie_count; exon_length <- stringtie_length
# kallisto input:
matrix <- kallisto_count
exon_length <- kallisto_length
# Function 1: filter and normalize the expression matrix.
# - apply() computes the fraction of non-zero values in each row; rows with a fraction
#   above `occurrence` are kept.
# - rowSums() computes the row totals; rows with a total above `ncount` are kept.
# - intersect() finds the sample IDs shared by the expression matrix and the phenotype
#   data frame; both objects are then reordered according to these shared IDs.
# - The matrix is normalized to FPKM (fragments per kilobase per million mapped reads)
#   and TPM (transcripts per million).
# - The normalized matrices are sorted by row name and a list containing counts, FPKM
#   and TPM is returned (a small worked example follows the function definition below).
# Inputs: matrix (expression matrix), exon_length (exon/effective length information),
#   phen (phenotype data frame), occurrence and ncount (filtering thresholds).
# Output: a list containing counts, FPKM and TPM.
get_profile <- function(matrix, exon_length,
                        y = phen,
                        occurrence = 0.2,
                        ncount = 10){
  # filter rows by occurrence and ncount
  prf <- matrix %>% rownames_to_column("Type") %>%
    filter(apply(dplyr::select(., -one_of("Type")), 1,
                 function(x){ sum(x > 0) / length(x) }) > occurrence) %>%
    data.frame(.) %>%
    column_to_rownames("Type")
  colnames(prf) <- gsub("_ballgown_out_dir", "", colnames(prf))
  prf <- prf[rowSums(prf) > ncount, ]
  # match the sample IDs between the profile and the phenotype table
  sid <- intersect(colnames(prf), y$SampleID)
  phen.cln <- y %>% filter(SampleID %in% sid) %>% arrange(SampleID)
  prf.cln <- prf %>% dplyr::select(as.character(phen.cln$SampleID))
  # determine the right order between profile and phenotype
  for(i in 1:ncol(prf.cln)){
    if(!(colnames(prf.cln)[i] == phen.cln$SampleID[i])){
      stop(paste0(i, " Wrong"))
    }
  }
  colnames(prf.cln) <- phen.cln$SampleID
  # normalize the profile to FPKM and TPM
  exon_length.cln <- exon_length[as.character(rownames(prf.cln)), , drop = FALSE]
  for (i in 1:nrow(exon_length.cln)) {
    if (rownames(exon_length.cln)[i] != rownames(prf.cln)[i]) {
      stop(paste0(i, " Wrong"))
    }
  }
  dat <- prf.cln / exon_length.cln$Length            # counts per base of gene length
  dat_FPKM <- t(t(dat) / colSums(prf.cln)) * 10^9    # scale by per-sample total counts; 10^9 = kb x million
  dat_TPM <- t(t(dat) / colSums(dat)) * 10^6         # scale by per-sample total rate; columns sum to 10^6
  dat_count <- prf.cln[order(rownames(prf.cln)), ]
  dat_fpkm <- dat_FPKM[order(rownames(dat_FPKM)), ]
  dat_tpm <- dat_TPM[order(rownames(dat_TPM)), ]
  res <- list(count = dat_count, fpkm = dat_fpkm, tpm = dat_tpm)
  return(res)
}
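# A small worked example of the FPKM/TPM arithmetic used above (a sketch with made-up
# numbers; toy_counts and toy_length are hypothetical, two genes by two samples).
toy_counts <- data.frame(S1 = c(100, 300), S2 = c(200, 600),
                         row.names = c("geneA", "geneB"))
toy_length <- c(geneA = 1000, geneB = 2000)             # gene lengths in bp
toy_rate <- toy_counts / toy_length                     # counts per base
toy_fpkm <- t(t(toy_rate) / colSums(toy_counts)) * 10^9
toy_tpm <- t(t(toy_rate) / colSums(toy_rate)) * 10^6
colSums(toy_tpm)                                        # each column sums to 1e6 by construction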
# Function 2: write the results of the previous function to files.
# Interactive debugging values for the arguments used below:
# result <- res; dir <- base_dir2; type <- "stringtie"
prf_out <- function(result, type = "STAR"){
  dir <- "../../Result/Profile/"
  if(!dir.exists(dir)){
    dir.create(dir, recursive = TRUE)
  }
  count_name <- paste0(dir, type, "_filtered_counts.tsv")
  FPKM_name <- paste0(dir, type, "_filtered_FPKM.tsv")
  TPM_name <- paste0(dir, type, "_filtered_TPM.tsv")
  write.table(x = result$count,
              file = count_name,
              sep = "\t",
              quote = FALSE,
              col.names = NA)
  write.table(x = result$fpkm,
              file = FPKM_name,
              sep = "\t",
              quote = FALSE,
              col.names = NA)
  write.table(x = result$tpm,
              file = TPM_name,
              sep = "\t",
              quote = FALSE,
              col.names = NA)
}
kallisto_prf <- get_profile(kallisto_count, kallisto_length)   # filter and normalize
prf_out(kallisto_prf, type = "kallisto")                       # write the output tables
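# The same two steps for the StringTie quantification (a sketch; it reuses the
# stringtie_count and stringtie_length objects built above).
stringtie_prf <- get_profile(stringtie_count, stringtie_length)
prf_out(stringtie_prf, type = "stringtie")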