学习小组-D7

Lulu是小虎

已于 2024-06-21 11:30:51 修改

阅读量257

点赞数 3

分类专栏：花花的学习小组文章标签：学习

于 2024-06-21 09:28:34 首次发布

本文链接：https://blog.csdn.net/weixin_46817132/article/details/139849289

版权

花花的学习小组专栏收录该内容

7 篇文章 0 订阅

订阅专栏

untar("GSE231920_RAW 2.tar",exdir = "GSE231920_RAW 2")#解包
library(stringr)
fs = paste0("GSE231920_RAW/",dir("GSE231920_RAW 2/"))
fs#每个样本的数据文件
samples = dir("GSE231920_RAW 2/") %>% str_split_i(pattern = "_",i = 2) %>% unique();samples

lapply(fs, function(s){
#s = fs[1]
for(i in 1:length(samples)){
#i = 1
if(str_detect(s,samples[[i]])){
file.copy(s,paste0("01_data/",samples[[i]]))
}
}
})

#1.5.1 为每个样本创建单独的文件夹
ctr = function(s){
ns = paste0("01_data/",s)
if(!file.exists(ns))dir.create(ns,recursive = T)
}
lapply(samples, ctr)
#1.5.2 每个样本的三个文件复制到单独的文件夹
lapply(fs, function(s){
for(i in 1:length(samples)){
if(str_detect(s,samples[[i]])){
file.copy(s,paste0("01_data/",samples[[i]]))
}
}
})
#1.5.3 所有文件改名，去掉前缀
on = paste0("01_data/",dir("01_data/",recursive = T));on
nn = str_remove(on,"GSM\\d+_sample\d_");nn
file.rename(on,nn)
#修改文件名时，路径时要从工作目录之下开始写的
#比如01_data/sample5/GSM7306058_sample5_barcodes.tsv.gz可以，只写GSM7306058_sample5_barcodes.tsv.gz就不行

#批量读取，练习需要跳过
rm(list = ls())
library(Seurat)
rdaf = "sce.all.Rdata"
if(!file.exists(rdaf)){
f = dir("01_data/")
scelist = list() #创建空的列表，下面的for循环每执行一次，scelist里面就会多一个元素。
for(i in 1:length(f)){
pda <- Read10X(paste0("01_data/",f[[i]]))
scelist[[i]] <- CreateSeuratObject(counts = pda,
project = f[[i]],
min.cells = 3,
min.features = 200)
print(dim(scelist[[i]]))#输出每个文件的基因数和细胞数
}
sce.all = merge(scelist[[1]],scelist[-1]) #合并多个对象
sce.all = JoinLayers(sce.all)
#merge后，每个样本的表达矩阵是一个独立的的layer，JoinLayers是合并为一个表达矩阵
set.seed(335)
sce.all = subset(sce.all,downsample=700)#每个样本抽700个细胞
save(sce.all,file = rdaf)
}
load(rdaf)
head(sce.all@meta.data)
table(sce.all$orig.ident) #每个样本多少细胞
sum(table(Idents(sce.all)))#细胞总数

#质量控制
sce.all[["percent.mt"]] <- PercentageFeatureSet(sce.all, pattern = "^MT-")
sce.all[["percent.rp"]] <- PercentageFeatureSet(sce.all, pattern = "^RP[SL]")
sce.all[["percent.hb"]] <- PercentageFeatureSet(sce.all, pattern = "^HB[^(P)]")

head(sce.all@meta.data, 3)
VlnPlot(sce.all,
features = c("nFeature_RNA",
"nCount_RNA",
"percent.mt",
"percent.rp",
"percent.hb"),
ncol = 3,pt.size = 0.5, group.by = "orig.ident")

#根据小提琴图指定指标去掉离群值
sce.all = subset(sce.all,percent.mt < 20&
nCount_RNA < 40000 &
nFeature_RNA < 6000)
table(sce.all@meta.data$orig.ident)

#整合降维聚类分群harmony
f = "obj.Rdata"
library(harmony)
if(!file.exists(f)){
sce.all = sce.all %>%
NormalizeData() %>%
FindVariableFeatures() %>%
ScaleData(features = rownames(.)) %>%
RunPCA(pc.genes = VariableFeatures(.)) %>%
RunHarmony("orig.ident") %>%
FindNeighbors(dims = 1:15, reduction = "harmony") %>%
FindClusters(resolution = 0.5) %>%
RunUMAP(dims = 1:15,reduction = "harmony") %>%
RunTSNE(dims = 1:15,reduction = "harmony")
save(sce.all,file = f)
}
load(f)
ElbowPlot(sce.all)

p1 = DimPlot(sce.all, reduction = "umap",label = T,pt.size = 0.5)+ NoLegend();p1
DimPlot(sce.all, reduction = "umap",pt.size = 0.5,group.by = "orig.ident")
#group.by = "orig.ident"就会按照样本来分配颜色

#细胞类型注释
library(celldex)
library(SingleR)
ls("package:celldex")
f = "../day6/ref_BlueprintEncode.RData"
#用了昨天文件夹里的数据，day6是昨天的文件夹名，按需修改
if(!file.exists(f)){
ref <- celldex::BlueprintEncodeData()
save(ref,file = f)
}
ref <- get(load(f))
library(BiocParallel)
scRNA = sce.all
test = scRNA@assays$RNA$data
pred.scRNA <- SingleR(test = test,
ref = ref,
labels = ref$label.main,
clusters = scRNA@active.ident)
pred.scRNA$pruned.labels
new.cluster.ids <- pred.scRNA$pruned.labels
names(new.cluster.ids) <- levels(scRNA)
scRNA <- RenameIdents(scRNA,new.cluster.ids)
save(scRNA,file = "scRNA.Rdata")
p2 <- DimPlot(scRNA, reduction = "umap",label = T,pt.size = 0.5) + NoLegend()
p1+p2

#分组可视化及组间细胞比例比较
scRNA$seurat_annotations = Idents(scRNA)
#把细胞类型设置为meta.data中的一列
#看分组信息
library(tinyarray)
edat = geo_download("GSE231920")
pd = edat$pd
#从右边点pd可以看到pd的内容。(是点pd或者pd后面的空白，不是点pd前面的三角符号)

#添加分组信息
scRNA$group = ifelse(scRNA$orig.ident %in% c("sample1","sample2","sample3"), "treat","control")
DimPlot(scRNA, reduction = "umap", group.by = "group")

#计算每个亚群的细胞数量和占全部细胞的比例
# 每种细胞的数量和比例
cell_counts <- table(Idents(scRNA))
cell.all <- cbind(cell_counts = cell_counts,
cell_Freq = round(prop.table(cell_counts)*100,2))
#各组中每种细胞的数量和比例
cell.num.group <- table(Idents(scRNA), scRNA$group)
cell.freq.group <- round(prop.table(cell.num.group, margin = 2) *100,2)
cell.all = cbind(cell.all,cell.num.group,cell.freq.group)
cell.all = cell.all[,c(1,3,4,2,5,6)]
colnames(cell.all) = paste(rep(c("all","control","treat"),times = 2),
rep(c("count","freq"),each = 3),sep = "_")
cell.all

#找某一细胞内部的组间差异基因
sub.obj = subset(scRNA,idents = "NK cells")
dfmarkers <- FindMarkers(scRNA, ident.1 = "treat", group.by = "group",min.pct = 0.25, logfc.threshold = 0.25,verbose = F)
head(dfmarkers)
# 找conserved marker基因
if(!require("multtest"))BiocManager::install('multtest')
if(!require("metap"))install.packages('metap')
sub.markers <- FindConservedMarkers(scRNA, ident.1 = "NK cells", grouping.var = "group", min.pct = 0.25, logfc.threshold = 0.25,verbose = F)
head(sub.markers)

#组间比较的气泡图
markers.to.plot = c("CD3D", "CREM", "HSPH1", "SELL", "GIMAP5", "CACYBP", "GNLY", "NKG7", "CCL5",
"CD8A", "MS4A1", "CD79A", "MIR155HG", "NME1", "FCGR3A", "VMO1", "CCL2", "S100A9", "HLA-DQA1",
"GPR183", "PPBP", "GNG11", "HBA2", "HBB", "TSPAN13", "IL3RA", "PRSS57") #一组感兴趣的基因
#如果idents有NA会报错https://github.com/satijalab/seurat/issues/8772
#scRNA <- subset(scRNA, seurat_annotations %in% na.omit(scRNA$seurat_annotations))
DotPlot(scRNA, features = markers.to.plot, cols = c("blue", "red"),
dot.scale = 8, split.by = "group") +
RotatedAxis()

#单个基因FeaturePlot一行是一个基因，一列是一个分组，可以很直观的对比。
FeaturePlot(scRNA, features = c("CD3D", "GNLY", "IFI6"), split.by = "group", max.cutoff = 3, cols = c("grey","red"), reduction = "umap")

plots <- VlnPlot(scRNA, features = c("LYZ", "ISG15", "MS4A1"), split.by = "group", group.by = "seurat_annotations",
pt.size = 0, combine = FALSE)
library(patchwork)
wrap_plots(plots = plots, ncol = 1)

#伪bulk 转录组差异分析
bulk <- AggregateExpression(scRNA, return.seurat = T, slot = "counts", assays = "RNA", group.by = c("seurat_annotations","orig.ident", "group"))
colnames(bulk)#整合成了多个“样本”
#可以只取出来一种细胞
sub <- subset(bulk, seurat_annotations == "CD8+ T-cells")
colnames(sub)
Idents(sub) <- "group"
de_markers <- FindMarkers(sub, ident.1 = "treat", ident.2 = "control", slot = "counts", test.use = "DESeq2",
verbose = F)
de_markers$gene <- rownames(de_markers)

#火山图
k1 = de_markers$avg_log2FC< -1 & de_markers$p_val <0.01
k2 = de_markers$avg_log2FC> 1 & de_markers$p_val <0.01
de_markers$change <- ifelse(k1,"down",ifelse(k2,"up","not"))
#满足k1就是down，不满足的话再看是否满足k2，满足k2就是up，不满足就是not。
library(ggplot2)
ggplot(de_markers, aes(avg_log2FC, -log10(p_val),color = change)) +
geom_point(size = 2, alpha = 0.5) +
geom_vline(xintercept = c(1,-1),linetype = 4)+
geom_hline(yintercept = -log10(0.01),linetype = 4)+
scale_color_manual(values = c("#2874C5", "grey", "#f87669"))+
theme_bw() +
ylab("-log10(unadjusted p-value)")

Lulu是小虎

关注

3
点赞
踩
7

收藏

觉得还不错? 一键收藏
0
评论
学习小组-D7

"GPR183", "PPBP", "GNG11", "HBA2", "HBB", "TSPAN13", "IL3RA", "PRSS57") #一组感兴趣的基因。#比如01_data/sample5/GSM7306058_sample5_barcodes.tsv.gz可以，只写GSM7306058_sample5_barcodes.tsv.gz就不行。scelist = list() #创建空的列表，下面的for循环每执行一次，scelist里面就会多一个元素。#从右边点pd可以看到pd的内容。
复制链接

扫一扫