Seurat包的打分函数AddModuleScore

本文介绍了Seurat包中的AddModuleScore函数,通过计算基因集在单个细胞中的表达得分,结合控制特征的随机比较,提供了基因集在细胞层次的量化评估。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

在单细胞数据分析的过程中,Seurat包提供了一个为一个基因集打分的函数AddModuleScore(自定基因集),为基因集进行打分常见的富集分析软件GSVA,今天我们来看看Seurat这个函数的用法和意义。

library(Seurat)
?AddModuleScore

给出的解释说明是Calculate module scores for feature expression programs in single cells,也就是计算单个细胞在一个基因集的score值。
描述是这样的

Calculate the average expression levels of each program (cluster) on 
single cell level, subtracted by the aggregated expression of control 
feature sets. All analyzed features are binned based on averaged 
expression, and the control features are randomly selected from each bin.

看完这个解释,云里雾里,不知所云。
再来看看用法和参数:

Usage:

     AddModuleScore(
       object,
       features,
       pool = NULL,
       nbin = 24,
       ctrl = 100,
       k = FALSE,
       assay = NULL,
       name = "Cluster",
       seed = 1,
       search = FALSE,
       ...
     )
Arguments:

  object: Seurat object

features: Feature expression programs in list

    pool: List of features to check expression levels agains, defaults
          to ‘rownames(x = object)’

    nbin: Number of bins of aggregate expression levels for all
          analyzed features

    ctrl: Number of control features selected from the same bin per
          analyzed feature

       k: Use feature clusters returned from DoKMeans

   assay: Name of assay to use

    name: Name for the expression programs

    seed: Set a random seed. If NULL, seed is not set.
   search: Search for symbol synonyms for features in ‘features’ that
          don't match features in ‘object’? Searches the HGNC's gene
          names database; see ‘UpdateSymbolList’ for more details

     ...: Extra parameters passed to ‘UpdateSymbolList’

计算结果会返回一个数值,有正有负,今天我们的任务就是来参透这个函数,既然看解释不明白,那就只能看原函数了。

function (object, features, pool = NULL, nbin = 24, ctrl = 100, 
    k = FALSE, assay = NULL, name = "Cluster", seed = 1, search = FALSE, 
    ...) 
{
    if (!is.null(x = seed)) {
        set.seed(seed = seed)
    }
    assay.old <- DefaultAssay(object = object)
    assay <- assay %||% assay.old
    DefaultAssay(object = object) <- assay
    assay.data <- GetAssayData(object = object)
    features.old <- features
    if (k) {
        .NotYetUsed(arg = "k")
        features <- list()
        for (i in as.numeric(x = names(x = table(object@kmeans.obj[[1]]$cluster)))) {
            features[[i]] <- names(x = which(x = object@kmeans.obj[[1]]$cluster == 
                i))
        }
        cluster.length <- length(x = features)
    }
    else {
        if (is.null(x = features)) {
            stop("Missing input feature list")
        }
        features <- lapply(X = features, FUN = function(x) {
            missing.features <- setdiff(x = x, y = rownames(x = object))
            if (length(x = missing.features) > 0) {
                warning("The following features are not present in the object: ", 
                  paste(missing.features, collapse = ", "), ifelse(test = search, 
                    yes = ", attempting to find updated synonyms", 
                    no = ", not searching for symbol synonyms"), 
                  call. = FALSE, immediate. = TRUE)
                if (search) {
                  tryCatch(expr = {
                    updated.features <- UpdateSymbolList(symbols = missing.features, 
                      ...)
                    names(x = updated.features) <- missing.features
                    for (miss in names(x = updated.features)) {
                      index <- which(x == miss)
                      x[index] <- updated.features[miss]
                    }
                  }, error = function(...) {
                    warning("Could not reach HGNC's gene names database", 
                      call. = FALSE, immediate. = TRUE)
                  })
                  missing.features <- setdiff(x = x, y = rownames(x = object))
                  if (length(x = missing.features) > 0) {
                    warning("The following features are still not present in the object: ", 
                      paste(missing.features, collapse = ", "), 
                      call. = FALSE, immediate. = TRUE)
                  }
                }
            }
            return(intersect(x = x, y = rownames(x = object)))
        })
        cluster.length <- length(x = features)
    }
    if (!all(LengthCheck(values = features))) {
        warning(paste("Could not find enough features in the object from the following feature lists:", 
            paste(names(x = which(x = !LengthCheck(values = features)))), 
            "Attempting to match case..."))
        features <- lapply(X = features.old, FUN = CaseMatch, 
            match = rownames(x = object))
    }
    if (!all(LengthCheck(values = features))) {
        stop(paste("The following feature lists do not have enough features present in the object:", 
            paste(names(x = which(x = !LengthCheck(values = features)))), 
            "exiting..."))
    }
    pool <- pool %||% rownames(x = object)
    data.avg <- Matrix::rowMeans(x = assay.data[pool, ])
    data.avg <- data.avg[order(data.avg)]
    data.cut <- cut_number(x = data.avg + rnorm(n = length(data.avg))/1e+30, 
        n = nbin, labels = FALSE, right = FALSE)
    names(x = data.cut) <- names(x = data.avg)
    ctrl.use <- vector(mode = "list", length = cluster.length)
    for (i in 1:cluster.length) {
        features.use <- features[[i]]
        for (j in 1:length(x = features.use)) {
            ctrl.use[[i]] <- c(ctrl.use[[i]], names(x = sample(x = data.cut[which(x = data.cut == 
                data.cut[features.use[j]])], size = ctrl, replace = FALSE)))
        }
    }
    ctrl.use <- lapply(X = ctrl.use, FUN = unique)
    ctrl.scores <- matrix(data = numeric(length = 1L), nrow = length(x = ctrl.use), 
        ncol = ncol(x = object))
    for (i in 1:length(ctrl.use)) {
        features.use <- ctrl.use[[i]]
        ctrl.scores[i, ] <- Matrix::colMeans(x = assay.data[features.use, 
            ])
    }
    features.scores <- matrix(data = numeric(length = 1L), nrow = cluster.length, 
        ncol = ncol(x = object))
    for (i in 1:cluster.length) {
        features.use <- features[[i]]
        data.use <- assay.data[features.use, , drop = FALSE]
        features.scores[i, ] <- Matrix::colMeans(x = data.use)
    }
    features.scores.use <- features.scores - ctrl.scores
    rownames(x = features.scores.use) <- paste0(name, 1:cluster.length)
    features.scores.use <- as.data.frame(x = t(x = features.scores.use))
    rownames(x = features.scores.use) <- colnames(x = object)
    object[[colnames(x = features.scores.use)]] <- features.scores.use
    CheckGC()
    DefaultAssay(object = object) <- assay.old
    return(object)
}

我们来一步一步解析这个函数
我们先用它的默认值

library(Seurat)
load(Rdata)
assay.old <- DefaultAssay(object = object)  ###结果为RNA
assay <- assay %||% assay.old   ### %||%的用法可自行查询,这里还是显示RNA
DefaultAssay(object = object) <- assay 
assay.data <- GetAssayData(object = object)
features.old <- features  ## 运行到这里,得到矩阵信息assay.data和gene list。
###这里我们慢慢运行
cluster.length <- length(x = features)
missing.features <- setdiff(x = features, y = rownames(x = object))  ##setdiff函数的用法 我们这里显然是不会有不同的。
###接下来我们就可以直接跳到
pool <- pool %||% rownames(x = object)
data.avg <- Matrix::rowMeans(x = assay.data[pool, ])
data.avg <- data.avg[order(data.avg)]    ###排序,从小到大
data.cut <- cut_number(x = data.avg + rnorm(n = length(data.avg))/1e+30, 
n = nbin, labels = FALSE, right = FALSE)
###rnorm(n, mean = 0, sd = 1) n 为产生随机值个数(长度),mean 是平均数, sd 是标准差 ,也就是说这里将数据切成了nbin份(由小到大)。
names(x = data.cut) <- names(x = data.avg)
ctrl.use <- vector(mode = "list", length = cluster.length)  ##空的列表
 for (i in 1:cluster.length) {
        features.use <- features[[i]]
        for (j in 1:length(x = features.use)) {
            ctrl.use[[i]] <- c(ctrl.use[[i]], names(x = sample(x = data.cut[which(x = data.cut == 
                data.cut[features.use[j]])], size = ctrl, replace = FALSE)))
        }
    }
ctrl.use <- lapply(X = ctrl.use, FUN = unique)
ctrl.scores <- matrix(data = numeric(length = 1L), nrow = length(x = ctrl.use), 
        ncol = ncol(x = object))   ####空的
for (i in 1:length(ctrl.use)) {
        features.use <- ctrl.use[[i]]
        ctrl.scores[i, ] <- Matrix::colMeans(x = assay.data[features.use, 
            ])
    }       #######细胞的平均值
features.scores.use <- features.scores - ctrl.scores
    rownames(x = features.scores.use) <- paste0(name, 1:cluster.length)
    features.scores.use <- as.data.frame(x = t(x = features.scores.use))
    rownames(x = features.scores.use) <- colnames(x = object)
    object[[colnames(x = features.scores.use)]] <- features.scores.use
    CheckGC()
    DefaultAssay(object = object) <- assay.old

走到这里,相信大家应该都明白了,也就是我们感兴趣的基因,抽出来,每一个细胞算一个这些基因表达的平均值,
背景基因的平均值在于找每个基因的所在的bin,在该bin内随机抽取相应的ctrl个基因作为背景,最后所有的目标基因算一个平均值,所有的背景基因算一个平均值,两者相减就是该gene set 的score值。
至于生物学意义,仁者见仁智者见智了。

<think>好的,用户需要寻找用于评估斑马鱼T细胞状态的R代码示例。首先,我需要考虑斑马鱼T细胞分析常用的方法。单细胞RNA测序(scRNA-seq)是常用的技术,因此应该涉及相关R,如Seurat或Scanpy。不过用户明确提到R语言,所以重点放在Seurat上。 接下来,用户提到的“状态评估”可能括细胞聚类、标记基因识别和功能注释。根据引用[3],细胞注释通常需要参考数据库或文献中的标记基因。斑马鱼作为模式生物,可能有特定的标记基因,比如cd4、cd8等。需要确保代码中含这些基因的检测步骤。 引用[4]中的PAGA图用于发育轨迹分析,这可能与细胞状态转变有关。虽然用户没有明确提到轨迹分析,但状态评估可能括发育阶段或激活状态的确定。因此可以考虑添加轨迹推断的步骤,比如使用monocle3或类似工具。 另外,数据预处理步骤也很重要,括标准化、降维和聚类。需要检查代码示例是否涵盖这些基础分析步骤。同时,可视化部分如UMAP图和热图展示标记基因,能帮助用户直观评估细胞状态。 考虑到用户可能不熟悉具体流程,代码示例应尽量详细并附有注释。同时,提醒用户注意数据质量控制,如线粒体基因比例过滤,确保分析可靠性。最后,引用相关文献和工具,如引用[3]中的参考数据库和标记基因比较方法,以及引用[4]中的轨迹分析,以增强答案的可信度。 需要确认提供的代码是否符合R语言的最新版本,避免过时的函数。此外,斑马鱼可能使用Ensembl基因ID,需提醒用户注意基因ID转换问题。最后,生成相关问题,帮助用户深入理解后续步骤,如数据整合、轨迹分析和多组学方法,引用相关文献支持。</think>以下是一个基于单细胞转录组数据分析斑马鱼T细胞状态的R代码示例,整合了细胞注释、标记基因分析和可视化流程: ```r # 加载必要的 library(Seurat) library(clustree) library(celldex) library(SingleR) library(scater) # 1. 数据预处理 tcell.data <- Read10X(data.dir = "path/to/your/filtered_feature_bc_matrix") tcell <- CreateSeuratObject(counts = tcell.data, project = "Zebrafish_Tcells", min.cells = 3, min.features = 200) # 质量控制 tcell[["percent.mt"]] <- PercentageFeatureSet(tcell, pattern = "^mt-") tcell <- subset(tcell, subset = nFeature_RNA > 200 & percent.mt < 20) # 标准化和数据缩放 tcell <- NormalizeData(tcell) tcell <- FindVariableFeatures(tcell, selection.method = "vst", nfeatures = 2000) tcell <- ScaleData(tcell) # 2. 降维和聚类 tcell <- RunPCA(tcell, features = VariableFeatures(object = tcell)) tcell <- FindNeighbors(tcell, dims = 1:15) tcell <- FindClusters(tcell, resolution = c(0.2, 0.4, 0.6)) tcell <- RunUMAP(tcell, dims = 1:15) # 3. 细胞类型注释(参考引用[3]方法) # 加载斑马鱼参考数据集 ref <- BlueprintEncodeData() pred <- SingleR(test = as.SingleCellExperiment(tcell), ref = ref, labels = ref$label.main) # 添加注释结果 tcell$celltype <- pred$labels # 4. T细胞亚群分析 tcell.markers <- FindAllMarkers(tcell, only.pos = TRUE, min.pct = 0.25) top10 <- tcell.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC) # 可视化标记基因 VlnPlot(tcell, features = c("cd4", "cd8a", "foxp3a", "il2ra")) FeaturePlot(tcell, features = c("lck", "rag1", "stat3")) # 5. 状态评估(激活/抑制状态) tcell <- AddModuleScore(tcell, features = list(Activation = c("il2", "ifng", "tnfa"), Exhaustion = c("pdcd1", "lag3", "ctla4")), name = "State_") # 可视化功能状态 FeaturePlot(tcell, features = c("State_Activation1", "State_Exhaustion1")) ``` ### 关键分析步骤说明: 1. **数据预处理**:含质量控制步骤,通过线粒体基因比例过滤低质量细胞[^3] 2. **细胞注释**:采用SingleR方法整合参考数据集进行自动注释[^3] 3. **标记基因分析**:识别T细胞亚群特征基因(如cd4、cd8等) 4. **功能状态评估**:通过基因模块评分量化细胞激活/耗竭状态 ### 注意事项: - 斑马鱼基因命名可能需要转换为同源人类基因名(使用biomaRt) - T细胞发育轨迹分析可结合PAGA方法(引用[4]) - 建议样本量>1000个细胞以保证分析可靠性[^1]
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值