由于pmbc3样本量太大,目前设备跑完需要大约14个小时,检测代码是否能够正常运行的时间成本较高,因此采用将pmbc3样本中的2700个细胞,人工选择300个进行测试。
运行代码如下。
setwd("D:/R/project/project1")
rm(list = ls())
library(Seurat)
######-----使用不同的数据-----######
#安装SeuratData包
#install.packages("remotes")
#remotes::install_github("satijalab/seurat-data")
library(SeuratData)
#AvailableData()#SeuratData数据库中包含的数据集
#第一次一般要安装pbmc3k数据:InstallData("pbmc3k"),再引用
#InstallData("pbmc3k")
data("pbmc3k")
exprMat <- as.matrix(pbmc3k@assays$RNA@data)
####-----人工筛选的300细胞(直接选择矩阵前300列)-----####
exprMat <- exprMat[,1:300]
dim(exprMat)
exprMat[1:4,1:4]
cellInfo <- pbmc3k@meta.data[,c(4,2,3)]
#cellInfo变量也需要进行修改
cellInfo <- cellInfo[1:300,]
dim(cellInfo)
colnames(cellInfo)=c('CellType', 'nGene' ,'nUMI')
head(cellInfo)
table(cellInfo$CellType)
####-----下面的部分和第一部分的相同-----####
### Initialize settings
library(SCENIC)
#报错:“object 'motifAnnotations_hgnc' not found“
data(list="motifAnnotations_hgnc_v9", package="RcisTarget")
motifAnnotations_hgnc <- motifAnnotations_hgnc_v9
# 保证cisTarget_databases 文件夹下面有下载好2个1G的文件,注意这个时候使用的是人的hg19的数据
scenicOptions <- initializeScenic(org="hgnc", dbDir="D:/R/project/project1/cisTarget_databases_hg19", nCores=1)
saveRDS(scenicOptions, file="D:/R/project/project1/int/scenicOptions.Rds")
### Co-expression network
genesKept <- geneFiltering(exprMat, scenicOptions)
exprMat_filtered <- exprMat[genesKept, ]
exprMat_filtered[1:4,1:4]
dim(exprMat_filtered)
runCorrelation(exprMat_filtered, scenicOptions)
exprMat_filtered_log <- log2(exprMat_filtered+1)
runGenie3(exprMat_filtered_log, scenicOptions)
library(foreach)
library(iterators)
library(parallel)
library(doParallel)
scenicOptions@settings$dbs <- scenicOptions@settings$dbs["10kb"] # Toy run settings
scenicOptions <- runSCENIC_1_coexNetwork2modules(scenicOptions)
scenicOptions <- runSCENIC_2_createRegulons(scenicOptions, coexMethod=c("top5perTarget")) # Toy run settings
scenicOptions <- runSCENIC_3_scoreCells(scenicOptions, exprMat_filtered_log)
scenicOptions <- runSCENIC_4_aucell_binarize(scenicOptions)
tsneAUC(scenicOptions, aucType="AUC") # choose settings
运行log(可能存在一定出入,但关键点没有问题)如下:
> source("D:/R/project/project1/300cells.R", echo=TRUE)
> setwd("D:/R/project/project1")
> rm(list = ls())
> library(Seurat)
The legacy packages maptools, rgdal, and rgeos, underpinning the sp package,
which was just loaded, will retire in October 2023.
Please refer to R-spatial evolution reports for details, especially
https://r-spatial.org/r/2023/05/15/evolution4.html.
It may be desirable to make the sf package available;
package maintainers should consider adding sf to Suggests:.
The sp package is now running under evolution status 2
(status 2 uses the sf package in place of rgdal)
Attaching SeuratObject
> ######-----使用不同的数据-----######
> #安装SeuratData包
> #install.packages("remotes")
> #remotes::install_github("satijalab/seurat-data")
> library(SeuratDa .... [TRUNCATED]
> #AvailableData()#SeuratData数据库中包含的数据集
> #第一次一般要安装pbmc3k数据:InstallData("pbmc3k"),再引用
> #InstallData("pbmc3k")
> data("pbmc3k")
> exprMat <- as.matrix(pbmc3k@assays$RNA@data)
> exprMat <- exprMat[,1:300]
> dim(exprMat)
[1] 13714 300
> exprMat[1:4,1:4]
AAACATACAACCAC AAACATTGAGCTAC AAACATTGATCAGC AAACCGTGCTTCCG
AL627309.1 0 0 0 0
AP006222.2 0 0 0 0
RP11-206L10.2 0 0 0 0
RP11-206L10.9 0 0 0 0
> cellInfo <- pbmc3k@meta.data[,c(4,2,3)]
> cellInfo <- cellInfo[1:300,]
> dim(cellInfo)
[1] 300 3
> colnames(cellInfo)=c('CellType', 'nGene' ,'nUMI')
> head(cellInfo)
CellType nGene nUMI
AAACATACAACCAC Memory CD4 T 2419 779
AAACATTGAGCTAC B 4903 1352
AAACATTGATCAGC Memory CD4 T 3147 1129
AAACCGTGCTTCCG CD14+ Mono 2639 960
AAACCGTGTATGCG NK 980 521
AAACGCACTGGTAC Memory CD4 T 2163 781
> table(cellInfo$CellType)
Naive CD4 T Memory CD4 T CD14+ Mono B CD8 T FCGR3A+ Mono NK DC Platelet
79 42 50 43 28 20 21 5 3
> #b <- c()
> #i <- 1
> #for(i in 1:length(a))
> #{
> # if(a[i] == 0)
> # {
> # b <- append(b,i)
> # }
>
>
> #}
> #exprMat <- exprMat[-b,]
> .... [TRUNCATED]
> #报错:“object 'motifAnnotations_hgnc' not found“
> data(list="motifAnnotations_hgnc_v9", package="RcisTarget")
> motifAnnotations_hgnc <- motifAnnotations_hgnc_v9
> # 保证cisTarget_databases 文件夹下面有下载好2个1G的文件,注意这个时候使用的是人的hg19的数据
> scenicOptions <- initializeScenic(org="hgnc", dbDir="D:/R/project/project1/cisTarget_ ..." ... [TRUNCATED]
Motif databases selected:
hg19-500bp-upstream-7species.mc9nr.feather
hg19-tss-centered-10kb-7species.mc9nr.feather
The tzdb package is not installed. Timezones will not be available to Arrow compute functions.
Using the column 'features' as feature index for the ranking database.
Using the column 'features' as feature index for the ranking database.
> saveRDS(scenicOptions, file="D:/R/project/project1/int/scenicOptions.Rds")
> ### Co-expression network
> genesKept <- geneFiltering(exprMat, scenicOptions)
Maximum value in the expression matrix: 419
Ratio of detected vs non-detected: 0.066
Number of counts (in the dataset units) per gene:
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 2.00 6.00 52.77 20.00 19117.00
Number of cells in which each gene is detected:
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 2.00 6.00 18.57 18.00 300.00
Number of genes left after applying the following filters (sequential):
5590 genes with counts per gene > 9
5568 genes detected in more than 3 cells
Using the column 'features' as feature index for the ranking database.
5141 genes available in RcisTarget database
Gene list saved in int/1.1_genesKept.Rds
> exprMat_filtered <- exprMat[genesKept, ]
> exprMat_filtered[1:4,1:4]
AAACATACAACCAC AAACATTGAGCTAC AAACATTGATCAGC AAACCGTGCTTCCG
NOC2L 0 0 0 0
HES4 0 0 0 0
ISG15 0 0 1 9
TNFRSF18 0 2 0 0
> dim(exprMat_filtered)
[1] 5141 300
> runCorrelation(exprMat_filtered, scenicOptions)
> exprMat_filtered_log <- log2(exprMat_filtered+1)
> runGenie3(exprMat_filtered_log, scenicOptions)
Using 461 TFs as potential regulators...
Running GENIE3 part 1
Running GENIE3 part 10
Running GENIE3 part 2
Running GENIE3 part 3
Running GENIE3 part 4
Running GENIE3 part 5
Running GENIE3 part 6
Running GENIE3 part 7
Running GENIE3 part 8
Running GENIE3 part 9
Finished running GENIE3.
> library(foreach)
> library(iterators)
> library(parallel)
> library(doParallel)
> #runSCENIC_1_coexNetwork2modules(scenicOptions)
> #runSCENIC_2_createRegulons(scenicOptions, coexMethod=c("top5perTarget"))
>
> #runSCENIC_3_scoreC .... [TRUNCATED]
> scenicOptions <- runSCENIC_1_coexNetwork2modules(scenicOptions)
19:56 Creating TF modules
Number of links between TFs and targets (weight>=0.001): 1252165
[,1]
nTFs 461
nTargets 5141
nGeneSets 3688
nLinks 2011914
> scenicOptions <- runSCENIC_2_createRegulons(scenicOptions, coexMethod=c("top5perTarget")) # Toy run settings
19:56 Step 2. Identifying regulons
载入程辑包:‘AUCell’
The following object is masked from ‘package:SCENIC’:
plotEmb_rgb
Using the column 'features' as feature index for the ranking database.
tfModulesSummary:
[,1]
top5perTarget 314
19:56 RcisTarget: Calculating AUC
Using the column 'features' as feature index for the ranking database.
Scoring database: [Source file: hg19-tss-centered-10kb-7species.mc9nr.feather]
20:03 RcisTarget: Adding motif annotation
Number of motifs in the initial enrichment: 91794
Number of motifs annotated to the matching TF: 1105
20:03 RcisTarget: Pruning targets
Using the column 'features' as feature index for the ranking database.
20:11 Number of motifs that support the regulons: 1105
Preview of motif enrichment saved as: output/Step2_MotifEnrichment_preview.html
> scenicOptions <- runSCENIC_3_scoreCells(scenicOptions, exprMat_filtered_log)
20:12 Step 3. Analyzing the network activity in each individual cell
Number of regulons to evaluate on cells: 53
Biggest (non-extended) regulons:
SPI1 (320g)
FOS (108g)
IRF7 (51g)
SPIB (49g)
TAF1 (44g)
XBP1 (31g)
CEBPD (28g)
RXRA (21g)
ELF2 (19g)
STAT1 (18g)
Quantiles for the number of genes detected by cell:
(Non-detected genes are shuffled at the end of the ranking. Keep it in mind when choosing the threshold for calculating the AUC).
min 1% 5% 10% 50% 100%
256.00 287.94 358.80 429.90 704.00 1834.00
Warning in .AUCell_calcAUC(geneSets = geneSets, rankings = rankings, nCores = nCores, :
Using only the first 287.94 genes (aucMaxRank) to calculate the AUC.
20:12 Finished running AUCell.
20:12 Plotting heatmap...
20:12 Plotting t-SNEs...
> scenicOptions <- runSCENIC_4_aucell_binarize(scenicOptions)
Binary regulon activity: 36 TF regulons x 290 cells.
(51 regulons including 'extended' versions)
36 regulons are active in more than 1% (2.9) cells.
> tsneAUC(scenicOptions, aucType="AUC") # choose settings
[1] "int/tSNE_AUC_50pcs_30perpl.Rds"
Warning messages:
1: In runGenie3(exprMat_filtered_log, scenicOptions) :
Only 461 (25%) of the 1839 TFs in the database were found in the dataset. Do they use the same gene IDs?
2: In RcisTarget::addLogo(tableSubset, motifCol = motifCol, dbVersion = dbVersion, :
There is no annotation version attribute in the input table (it has probably been loaded with an older version of the package).'v9' will be used as it was the old default,but we recommend to re-load the annotations and/or re-run the enrichment to make sure everything is consistent.