愿武艺晴小朋友一定要每天都开心
1. input文件路径,ArchR只需要样本的atac_fragments.tsv.gz文件
input.file.list <-c("./Fragments/GSM4138888_scATAC_BMMC_D5T1.fragments.tsv.gz",
"./Fragments/GSM4138889_scATAC_BMMC_D6T1.fragments.tsv.gz",
"./Fragments/GSM4138890_scATAC_CD34_D7T1.fragments.tsv.gz",
"./Fragments/GSM4138891_scATAC_CD34_D8T1.fragments.tsv.gz",
"./Fragments/GSM4138892_scATAC_CD34_D9T1.fragments.tsv.gz",
"./Fragments/GSM4138893_scATAC_PBMC_D10T1.fragments.tsv.gz",
"./Fragments/GSM4138894_scATAC_PBMC_D11T1.fragments.tsv.gz",
"./Fragments/GSM4138895_scATAC_PBMC_D12T1.fragments.tsv.gz",
"./Fragments/GSM4138896_scATAC_PBMC_D12T2.fragments.tsv.gz",
"./Fragments/GSM4138897_scATAC_PBMC_D12T3.fragments.tsv.gz")
#设置样本名,相当于seurat的orig.ident
sampleNames=c("BMMC_D5T1","BMMC_D6T1","CD34_D7T1","CD34_D8T1","CD34_D9T1","PBMC_D10T1","PBMC_D11T1","PBMC_D12T1","PBMC_D12T2","PBMC_D12T3")
2.创建Arrow文件 (耗时1h左右,需要耐心等待,可以保存个ArchRProject的备份)
ArrowFiles <- createArrowFiles(
inputFiles = input.file.list,
sampleNames = sampleNames,
minTSS = 9.3, #这个参数在作者8的基础上调整;目标是细胞总数跟作者(35038)大差不差
minFrags = 1100, #在作者1000基础上调整;目标是细胞总数跟作者(35038)大差不差
minFragSize = 10, # 默认genomeAnnotation = genomeAnnotation,
maxFragSize = 2000, # 默认
excludeChr = c("chrM", "chrY"), # 或者:excludeChr = c("chrM","chrY","chrX"),
addTileMat = TRUE,
force = TRUE, #强制覆盖之前的Arrow文件
addGeneScoreMat = TRUE) #在当前目录下生成一个"QualityControl"目录
ArrowFiles #查看是否是一个存放Arrow文件路径的向量
## 我从greenleaf那里分享两张图;需要深刻理解Arrow文件意思和运行原理:
#评估双细胞分数for every single cell
doubScores <- addDoubletScores(input = ArrowFiles,
k = 10, #或10 #Refers to how many cells near a "pseudo-doublet" to count.
knnMethod = "UMAP",
LSIMethod = 1)
#创建一个ArchR-Project
projHeme1 <- ArchRProject(ArrowFiles = ArrowFiles,
outputDirectory="Hematopoiesis-reappear-atac-original",
copyArrows=TRUE)
然后降维部分仔细梳理了一下估计是应该分为两个大的步骤:
一、 使用LSI得到PeakMatrix矩阵
## 4.1 TF-IDF
projHeme2 <- addIterativeLSI(
ArchRProj = projHeme2,
useMatrix = "TileMatrix", #1)先利用TileMatrix矩阵
name = "IterativeLSI-TileMatrix",
force = TRUE,iterations = 4, ## 增加迭代次数,追求达到收敛标准
varFeatures = 20000, ## varFeatures的降低代表着去关注变异更大的那些峰;调一个最小值:1000;10000;看一看形状是不是和作者像
dimsToUse = 2:25,
clusterParams = list(resolution = 0.8)) # 分辨率的降低,降低数据粒度,减少数据批次效应
## 4.2 得到TileMatrix来源的Cluster
projHeme2 <- addClusters(
input = projHeme2,
reducedDims = "IterativeLSI-TileMatrix",
method = "Seurat",
name = "TileMatrix_Clusters",
force = TRUE, dimsToUse = 2:25, ##
# maxClusters = 36,
resolution = 0.8)
##4.3 在"IterativeLSI-TileMatrix"降维聚类以后在进行线性UMAP降维 ;
projHeme2 <- addUMAP(
ArchRProj = projHeme2,
reducedDims = "IterativeLSI-TileMatrix", #
name = "UMAP_TileMatrix", #
force = TRUE,
nNeighbors = 55,
n_components =2,
minDist = 0.45)
## 4.4 以样本为对象进行可视化展示
dev.off() %>% dev.off()%>% dev.off()%>% dev.off()%>% dev.off()%>%dev.off()%>% dev.off()%>% dev.off()%>% dev.off() ## 这里ArchR里内置函数会打开很多图层,都把它关掉,才能在运行plotEmbedding时,直接在plots图框中看见图 ^_^
plotEmbedding(ArchRProj = projHeme2,
colorBy = "cellColData",embedding = "UMAP_TileMatrix",
name = "Sample", #或者映射给Clusters #"ScranClusters", #"Sample"
alpha=0.3)+ theme(legend.position = "None") #
## 4.3 拿到TileMatrix来源的Cluster后进行拟混池重复
library('BSgenome.Hsapiens.UCSC.hg19')
projHeme2 <- addGroupCoverages(ArchRProj=projHeme2, groupBy = "TileMatrix_Clusters")
projHeme2 <- addReproduciblePeakSet( #添加可再现峰集
ArchRProj = projHeme2,
groupBy = "TileMatrix_Clusters", maxPeaks = 200000,
pathToMacs2 = "/home/**/miniconda3/envs/Rlibrary/bin/macs2")
## 4.4 拿到PeakMatrix矩阵
projHeme2 <- addPeakMatrix(projHeme2)
getAvailableMatrices(projHeme2)
# [1] "GeneScoreMatrix" "PeakMatrix" "TileMatrix" ^_^是有的哈
二、 以scATAC-seq为中心的 :LSI聚类和可视化
## 4.5 用拿到的PeakMatrix矩阵重新计算TF-IDF
projHeme2 <- addIterativeLSI(
ArchRProj = projHeme2,
useMatrix = "PeakMatrix", # 用PeakMatrix重新计算TF-IDF
name = "PeakMatrix-IterativeLSI", # 给PeakMatrix进行迭代LSI的降维
iterations = 4,
force = TRUE,
dimsToUse = 1:50, ## 1)1:50
clusterParams = list(resolution = 1.5,dimsToUse = 1:25)) ##1) 1.5 2) 3
## (可选步骤) 如果在下游还是发现有明显的批次效应,推荐使用Harmony矫正批次效应
projHeme2 <- addHarmony(
ArchRProj = projHeme2,
reducedDims = "PeakMatrix-IterativeLSI",
name = "Harmony",
groupBy = "Sample")
## 这一步会在我们的projHeme2创建一个新的名为Harmony的reduceDims对象
## 可以通过projHeme2@reducedDims$Harmony访问
## 4.6 在得到的"PeakMatrix-IterativeLSI"降维对象中:再次使用Seurat的FindClusters()函数进行聚类;其中addClusters函数不会决定addUMAP函数的结果和位置,但addClusters要先运行(聚类)才能在addUMAP函数执行时,进行cluster的嵌入(embedding)
projHeme2 <- addClusters(
input = projHeme2,
reducedDims ="PeakMatrix-IterativeLSI",
method = "Seurat",
name = "PeakMatrix-Clusters",
force = TRUE,
dimsToUse = 1:50,
resolution = 1.5,
maxClusters = 36)
## 4.7 在"PeakMatrix-IterativeLSI"降维聚类以后在进行线性UMAP降维 ;
projHeme2 <- addUMAP(
ArchRProj = projHeme2,
reducedDims = "PeakMatrix-IterativeLSI", #或 Harmony
name = "UMAP", #TSNE 或UMAP
force = TRUE,
nNeighbors = 55,
n_components =2,
minDist = 0.45)
## 4.8 使用MAGIC进行(标记)基因得分的可视化
projHeme2 <- addImputeWeights(projHeme2,
reducedDims = "PeakMatrix-IterativeLSI") #dimsToUse = 25
##可以用cowplot绘制所有的标记基因
markerGenes <- c("CD34","CD14","CEBPB","CD3D","CD19","GATA1") #
p <- plotEmbedding(
ArchRProj = projHeme2,
colorBy = "GeneScoreMatrix",
name = markerGenes,
embedding = "UMAP",
imputeWeights = getImputeWeights(projHeme2)) #
#Rearrange for grid plotting #用cowplot绘制所有的标记基因
p2 <- lapply(p, function(x){
x + guides(color = FALSE,fill = FALSE) +
theme_ArchR(baseSize = 6.5) +
theme(plot.margin = unit(c(0, 0, 0, 0), "cm")) +
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank())})
do.call(cowplot::plot_grid, c(list(ncol = 3),p2))
## 无论您选择哪种方法,输入参数都会对结果嵌入产生剧烈影响,影响UMAP坐标的数值?
其中ArchR官方教程中对关键降维函数:LSI;有这样一段描述:它具有不确定性,以完全相同的参数完全相同的方式运行LSI:LSI将高度相似,但并不完全相同。
因此要仔仔细细的对上面参数进行调整:适合自己的才是最好的,每个数据都会有它自己的特点。
例如我对varFeatures参数的调整如下(去明白varFeatures参数影响,其他保持不变):
因此:varFeatures参数 =7000 感觉是比较合适的
对于分辨率resolution 参数的调整方法差不多:
resolution =0.83 感觉是比较合适的
然后,在addIterativeLSI; Seurat::dimsToUse的值似乎对UMAP的值无影响?
然后,在addIterativeLSI; dimsToUse 在SVD奇异值分解那的,变异度较大
还是不太像greenleaf作者的。。。。。。先回睡一觉明天再说吧