seurat流程
#定义函数
norm_range = function(x){
ids = summary(x)
iqr = 1.5*(ids[5] - ids[2])
print(ids)
print(paste0('norm_range:',ids[2]-iqr,'-',ids[5]+iqr))
}
# 1.构建对象
min.cells = 0 # min.cells 某一个基因至少在多少个基因中表达
min.features = 0 # min.features 某个细胞至少表达多少个基因
sce = CreateSeuratObject(counts = raw.data,metadata = metadata,min.cells =min.cells,min.features =min.features)
sce = AddMetaData(object = sce,metadata = metadata)
sce = AddMetaData(object = sce, percent.ercc, col.name = "percent.ercc")
# 2.数据清洗
# 用数据框的筛选形式可以对sce进行基因和样本筛选
### QC_filter all cells
erccs = grep('^ERCC-', x= rownames(sce),value = T) # value = T 获取名字
rp = grep("^RP[SL][[:digit:]]", x= rownames(sce),value = T) # value = T 获取名字
mt = grep('^MT-', x= rownames(sce),value = T) # value = T 获取名字)
pHB = grep('^HBA|^HBB', x= rownames(sce),value = T) # value = T 获取名字) # 红细胞marker
ncRNA = grep("^[A-Z][A-Z][0-9]*\\.[0-9]", x= rownames(sce),value = T)
LOC = grep('(^LOC|LINC)[1-9]*', x= rownames(sce),value = T) # value = T 获取名字)
sce[["percent.ercc"]] = PercentageFeatureSet(sce, pattern = "^ERCC-")
sce[["percent.rp"]] = PercentageFeatureSet(sce, pattern = "^RP[SL][[:digit:]]")
sce[["percent.mt"]] = PercentageFeatureSet(sce, pattern = "^MT-")
sce[["percent.hb"]] = PercentageFeatureSet(sce, pattern = "^HBA|^HBB")
sce[["percent.ncRNA"]] = PercentageFeatureSet(sce, pattern =("^[A-Z][A-Z][0-9]*\\.[0-9]")
sce[["percent.LOC"]] = PercentageFeatureSet(sce, pattern = "(^LOC|LINC)[1-9]*")
norm_range(sce@meta.data$nCount_RNA)
norm_range(sce@meta.data$nFeature_RNA)
norm_range(sce@meta.data$percent.mt)
norm_range(sce@meta.data$percent.hb)
norm_range(sce@meta.data$percent.ncRNA)
norm_range(sce@meta.data$percent.LOC)
# 3. 筛选
sce = subset(x=sce, subset = nCount_RNA > 50000 & nFeature_RNA > 500)
### genes filter
ids = c(erccs,LOC,mt,ncRNA,rp) %>% unique
sce = sce[-which(rownames(sce) %in% ids),]
sce
#####################################################################################################################
# seurat 流程
# 1.log
sce = NormalizeData(object = sce,normalization.method = "LogNormalize", scale.factor = 1e6)
# 2.高变基因
sce = FindVariableFeatures(object = sce,selection.method = "vst", nfeatures = 2000)
# 3.标准化
sce = ScaleData(object = sce)
# 4. PCA
sce = RunPCA(object = sce, do.print = FALSE)
# 5.构建图
sce= FindNeighbors(sce, dims = 1:20)
# 6. 聚类
sce = FindClusters(sce, resolution = 0.5)
# 7.tsne
sce=RunTSNE(sce,dims.use = 1:20) ##tsne降维
#####################################################################################################################
sce.2 <- SCTransform(sce, method = "glmGamPoi", vars.to.regress = "percent.mt", verbose = FALSE)
# 4. PCA
sce.2 = RunPCA(object = sce.2, do.print = FALSE)
# 5.构建图
sce.2= FindNeighbors(sce.2, dims = 1:20)
# 6. 聚类
sce.2 = FindClusters(sce.2, resolution = 0.5)
# 7.tsne
sce.2=RunTSNE(sce.2,dims.use = 1:20) ##tsne降维
my Dimplot
ids.group = 'CellType' # 注释后的label信息 ,改为cell_type,这里可以设置成自己的分组
ids.title = 'Number of CT-26: 9,181'
umap = sce@reductions$umap@cell.embeddings %>% #获取坐标信息
as.data.frame() %>%
cbind(cell_type = sce[[ids.group]])
cell_type_med <- umap %>%
group_by(cell_type) %>%
summarise(
UMAP_1 = median(UMAP_1),
UMAP_2 = median(UMAP_2)
)
DimPlot(sce, group.by =ids.group)+
scale_color_npg(alpha=0.7) + ggtitle(ids.title)+ # 设置title
geom_label_repel(aes(UMAP_1,UMAP_2,label=cell_type), fontface="bold",data = cell_type_med,
point.padding=unit(0.5, "lines"))+
theme(
axis.title = element_blank(), #轴标题
axis.text = element_blank(), # 文本
axis.ticks = element_blank(),
axis.line = element_blank())+
geom_segment(aes(x = min(umap$UMAP_1) , y = min(umap$UMAP_2) ,
xend = min(umap$UMAP_1) +3, yend = min(umap$UMAP_2) ),
colour = "black", size=1,arrow = arrow(length = unit(0.3,"cm")))+
geom_segment(aes(x = min(umap$UMAP_1) , y = min(umap$UMAP_2) ,
xend = min(umap$UMAP_1) , yend = min(umap$UMAP_2) + 3.5),
colour = "black", size=1,arrow = arrow(length = unit(0.3,"cm"))) +
annotate("text", x = min(umap$UMAP_1) +1.5, y = min(umap$UMAP_2) -1, label = "UMAP_1",
color="black",size = 4) +
annotate("text", x = min(umap$UMAP_1) -1, y = min(umap$UMAP_2) + 1.5, label = "UMAP_2",
color="black",size = 4,angle=90)
ggsave('Result/plts/UMAP_Cell_experimental.pdf',height = 20,width = 25,units = 'cm')
myFindMarkers
{
# 参数设置
ids.group = 'TissueType' # 分组信息
ids.min.pct = 0.25 # 差异基因最低表达比例
ids.mc.cores = 24 # 设置所调用的线程数量
ids.path = 'Result/csv/DEGs_of_TissueType/Markers_of_' # 设置文件输出路径和前缀
unique(Idents(sce))
Idents(sce) = sce[[ids.group]]
M=unique(Idents(sce))
print(M)
# 设置多线程识别差异基因
if(T){
FindMarker.wrapper <- function(x){
FindMarkers(sce,ident.1=x, only.pos = TRUE, min.pct=ids.min.pct)
}
Markers <- parallel::mclapply(M, FindMarker.wrapper, mc.cores = ids.mc.cores)
DEGs = paste0('DEGs.',M)
df = NULL
for (i in seq_along(DEGs)) {
ids.df = as.data.frame(Markers[i])
ids.df$Tissue = M[i]
write.csv(ids.df,file = paste0(ids.path,M[i],'.csv'))
}
df = as.data.frame(df)
head(df)
}
}
CellPhoneDB 流程
if(T){
sce = sce.raw
cell.1 <