library(tidyverse)
library(data.table)
library(GenomicFeatures)
# 求出每个外显子的长度
txdb <- makeTxDbFromGFF("Homo_sapiens.GRCh38.101.gtf",format="gtf")
a <- read.csv("counts.csv")
normalizeGeneCounts <- function(counts, TxDb, method) {
require("GenomicFeatures")
length <- width(genes(TxDb))/10 ^ 3 #gene length per kb
if (method == "CPM") {
normalized <-
as.data.frame(apply(counts,2, function(x) (x/sum(x)) * 10 ^ 6))
} else if (method == "RPKM") {
normalized <-
as.data.frame(t(apply(counts, 1, "/", colSums(counts) / 10 ^ 6)) / length)
} else if (method == "TPM") {
normalized <-
as.data.frame(t(apply(
counts / length, 1, "/", colSums(counts / length) / 10 ^ 6
)))
} else {
stop("method must be CPM, RPKM or TPM")
}
return(normalized)
}
TPM <- normalizeGeneCounts(counts, TxDb = txdb, method = "TPM")
count to tpm转化的函数记录一下
但是单细胞normalization用tpm不太好,因为太过稀疏转录数量和基因长度无明显相关性
#取出表达量矩阵转化成类TPM格式
TPM <- expm1(integrated[["RNA"]]@counts)
#读取所有假基因文件
all <- read.table("all.txt",header = TRUE)
#按名称索引基因名列
genename <- all$gene_name
#取表达量矩阵的行名
row<-rownames(TPM)#取行名
#取两个字符串的交集
useful <- intersect(row,genename)
#筛选特定行
MYH16 <- TPM[useful,]
#筛选表达量大于10的行
MYH16 <- MYH16[which(rowSums(MYH16) > 10),]
#查看可表达的基因名
row.names(MYH16)
#计算分组基因表达量
Idents(integrated) <- "cell_type"#可以换不同的分组
AverageExp <- AverageExpression(integrated)
expr <- AverageExp$RNA#取RNA slot
# 增加分组前缀,这里增加的是"Cluster"
#for(i in 1:ncol(expr)){colnames(expr)[i] = paste("Cluster", colnames(expr)[i],sep = "")}
allpseu = read.table("D:/脑谱图/假基因gtf文件/allpseu.txt")
a <- allpseu$gene_name
row <- rownames(expr)
useful <- intersect(row,a)
expr <- expr[useful,]
ey <- expr[which(rowSums(expr)>0),]
将RNA的数据scale一下存为second+.rds