# 下载了一个表达矩阵,发现基因名是 Ensembl 基因 ID(ENSG 开头),需要先转换为 gene symbol,再构建 Seurat 对象。
# Read every gzipped, tab-separated expression table found in `data_path`
# and bind them column-wise into one table, `df`.
fs <- list.files(path = data_path, pattern = "txt.gz$")
if (length(fs) == 0) {
  # Without this guard do.call(cbind, list()) silently returns NULL and the
  # failure only surfaces later, far from its cause.
  stop("No files matching 'txt.gz$' found in: ", data_path, call. = FALSE)
}

dat <- lapply(fs, function(i) {
  # Spell out TRUE: `T` is an ordinary variable and can be reassigned.
  read.table(file.path(data_path, i), header = TRUE, sep = "\t")
})

# cbind() pairs rows purely by position and keeps the row names of the first
# table, so tables whose row names differ would be combined misaligned.
same_rows <- vapply(
  dat,
  function(x) identical(rownames(x), rownames(dat[[1]])),
  logical(1)
)
if (!all(same_rows)) {
  stop(
    "Row names differ from the first file in: ",
    paste(fs[!same_rows], collapse = ", "),
    call. = FALSE
  )
}

df <- do.call(cbind, dat)
# 先把 ENSG ID 转换为 gene symbol
library(gprofiler2)

# Convert the identifiers used as row names of `df` into gene symbols, keep
# one row per symbol, and build the Seurat object from the renamed table.
oldnames <- rownames(df)
query1 <- gconvert(
  query = oldnames,
  numeric_ns = "ENTREZGENE_ACC",
  organism = "hsapiens",
  filter_na = FALSE
)
if (is.null(query1) || nrow(query1) == 0) {
  stop("gconvert() returned no mappings for the row names of df", call. = FALSE)
}

# Drop mappings without a usable symbol. Besides NA, unmapped IDs may come
# back as placeholder strings -- presumably "nan" / "None" / "N/A"; TODO
# confirm against the installed gprofiler2 version.
symbols <- as.character(query1$name)
has_symbol <- !is.na(symbols) & !(symbols %in% c("", "nan", "None", "N/A"))
mapping <- query1[has_symbol, c("input", "name"), drop = FALSE]

# The conversion result can hold several rows for one input ID, so its row
# positions do not line up with the rows of df. Keep the first hit per input
# ID and the first ID per symbol so the mapping is unique in both directions.
mapping <- mapping[!duplicated(mapping$input), , drop = FALSE]
mapping <- mapping[!duplicated(mapping$name), , drop = FALSE]

# Map each retained ID back to its row in df through the original row names,
# rather than through positions in the conversion result.
pos <- match(as.character(mapping$input), oldnames)
found <- !is.na(pos)
pos <- pos[found]
newnames <- as.character(mapping$name)[found]
if (length(pos) == 0) {
  stop("None of the row names of df could be mapped to a gene symbol",
       call. = FALSE)
}

# Extract the sub-table; drop = FALSE keeps a one-column df a data frame.
df <- df[pos, , drop = FALSE]
rownames(df) <- newnames

# Build the Seurat object
seurat_object <- CreateSeuratObject(counts = df)
# 完成!