Rtsne的问题总结

32 篇文章 1 订阅
本文探讨了在R和Python中使用TSNE进行数据可视化时,结果出现显著差异的原因。作者通过模拟数据集发现,主要差异源于预处理步骤,特别是归一化方法。R中的标准化采用总和比例,而Python的scanpy库使用了中位数归一化。通过调整R中的预处理流程以匹配scanpy的中位数归一化,结果变得更加一致。这表明预处理在单细胞数据分析中的重要性。
摘要由CSDN通过智能技术生成

今天在造dropout模拟数据集的时候,发现了一个问题,Rtsne的结果和skelarn中的tsne画图结果非常不同,让我感到很好奇,到底是什么原因导致这个的

%load_ext rpy2.ipython
%%R -o counts -o truecounts -o geneinfo -o cellinfo

# make sure that splatter is installed: https://github.com/Oshlack/splatter
suppressPackageStartupMessages({
  library(splatter)
  library(Rtsne)
  library(ggplot2)
  library(repr)
})


simulate <- function(nGroups=2, nGenes=200, batchCells=2000, dropout=3)
{
    if (nGroups > 1) method <- 'groups'
    else             method <- 'single'

    group.prob <- rep(1, nGroups) / nGroups

    # new splatter requires dropout.type
    if ('dropout.type' %in% slotNames(newSplatParams())) {
        if (dropout)
            dropout.type <- 'experiment'
        else
            dropout.type <- 'none'
        
        sim <- splatSimulate(group.prob=group.prob, nGenes=nGenes, batchCells=batchCells,
                             dropout.type=dropout.type, method=method,
                             seed=42, dropout.shape=-1, dropout.mid=dropout)

    } else {
        sim <- splatSimulate(group.prob=group.prob, nGenes=nGenes, batchCells=batchCells,
                             dropout.present=!dropout, method=method,
                             seed=42, dropout.shape=-1, dropout.mid=dropout)        
    }

    counts     <- as.data.frame(t(counts(sim)))
    truecounts <- as.data.frame(t(assays(sim)$TrueCounts))

    dropout    <- assays(sim)$Dropout
    mode(dropout) <- 'integer'

    cellinfo   <- as.data.frame(colData(sim))
    geneinfo   <- as.data.frame(rowData(sim))

    list(counts=counts,
         cellinfo=cellinfo,
         geneinfo=geneinfo,
         truecounts=truecounts)
}
sim <- simulate(nGroups=6, dropout=1) ## 因为这个地方的dropout从1变成了3,导致两者不一致

counts <- sim$counts
geneinfo <- sim$geneinfo
cellinfo <- sim$cellinfo
truecounts <- sim$truecounts

X <- t(counts) ## counts with dropout
Y <- as.integer(substring(cellinfo$Group,6))
Y <- Y-1

X.normalized <- apply(X, 2, function(z) z/sum(z)) ## 我大概知道这个图的原因了,因为sc.ppnor
X.normalized <- log(X.normalized + 1)
tsne.X <- Rtsne(t(X.normalized), max_iter = 1000)
  
tsne_plot.X <- data.frame(`x-tsne` = tsne.X$Y[,1], `y-tsne` = tsne.X$Y[,2], 
                        truelabel = Y, check.names = F)
tsne_plot.X$truelabel <- factor(tsne_plot.X$truelabel, levels = c(0:max(Y)))

ggplot(tsne_plot.X) + geom_point(aes(x=`x-tsne`, y=`y-tsne`, color=truelabel), size=0.5) +
          labs(color='true label') +
          ggtitle("Simulated data") +
          theme_bw() +
          theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), 
                legend.key = element_rect(fill = 'white', colour = 'white'), legend.position="none",
                axis.title.y=element_blank(), axis.title.x=element_blank())

# sim <- simulate()

# counts <- sim$counts
# geneinfo <- sim$geneinfo
# cellinfo <- sim$cellinfo
# truecounts <- sim$truecounts
## 我也可以在R中显示的

结果如下
在这里插入图片描述

visulize in python

import scanpy as sc 
adata = sc.AnnData(counts.values, obs=cellinfo, var=geneinfo)
adata.obs_names = cellinfo.Cell
adata.var_names = geneinfo.Gene

adata.obs.index = list(adata.obs.index) ## add this line to avoid error
adata.var.index = list(adata.var.index) ## add this line to avoid error


#sc.pp.filter_genes(adata, min_counts=1)


adata_true = sc.AnnData(truecounts.values, obs=cellinfo, var=geneinfo)
adata_true.obs_names = cellinfo.Cell
adata_true.var_names = geneinfo.Gene
adata_true.obs.index = list(adata_true.obs.index) ## add this line to avoid error
adata_true.var.index = list(adata_true.var.index) ## add this line to avoid error

adata_true = adata_true[:, adata.var_names].copy()

sc.pp.normalize_per_cell(adata)
sc.pp.normalize_per_cell(adata_true)

sc.pp.log1p(adata)
sc.pp.log1p(adata_true)

sc.pp.pca(adata)
sc.pp.pca(adata_true)

sc.tl.tsne(adata)
sc.tl.tsne(adata_true)

sc.pp.neighbors(adata)
sc.pp.neighbors(adata_true)
sc.pl.tsne(adata,color=["Group"],title="dropout_tsne")
sc.pl.tsne(adata_true,color=["Group"],title="true_tsne")

sc.tl.umap(adata)
sc.tl.umap(adata_true)
sc.pl.umap(adata,color=["Group"],title="dropout_umap")
sc.pl.umap(adata_true,color=["Group"],title="true_out")

在这里插入图片描述
在这里插入图片描述
在这里插入图片描述可以看到这结果的tsne看起来是不一样的,这个结果很奇怪,Rtsne有两个cluster竟然显示出来了,而python中的tsne直接混合了数据

我首先排除了可能是Rtsne的原因

# X_raw=adata.X.copy()
# Y=adata.obs["Group"].cat.codes.values

X_raw=adata_true.X.copy()
Y=adata_true.obs["Group"].cat.codes.values

%%R -i X_raw -i Y
tsne.X <- Rtsne(X_raw, max_iter = 1000)
  
tsne_plot.X <- data.frame(`x-tsne` = tsne.X$Y[,1], `y-tsne` = tsne.X$Y[,2], 
                        truelabel = Y, check.names = F)
tsne_plot.X$truelabel <- factor(tsne_plot.X$truelabel, levels = c(0:max(Y)))

ggplot(tsne_plot.X) + geom_point(aes(x=`x-tsne`, y=`y-tsne`, color=truelabel), size=0.5) +
          labs(color='true label') +
          ggtitle("Simulated data") +
          theme_bw() +
          theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), 
                legend.key = element_rect(fill = 'white', colour = 'white'), legend.position="none",
                axis.title.y=element_blank(), axis.title.x=element_blank())

结果如下
在这里插入图片描述所以唯一的问题,就是这个预处理的过程
R的预处理流程如下
在这里插入图片描述而scanpy的处理流程如下
在这里插入图片描述根据我前几天刚看的sc.pp.normal_per_cell的实现原理,里面涉及到中位数,但是R中的标准化并没有,也就是说scanpy得到的结果可能会更稳定,这也就是结果的原因
我验证了这个结果,我在R中实现了中位数的标准化,

%%R -o counts -o truecounts -o geneinfo -o cellinfo

# make sure that splatter is installed: https://github.com/Oshlack/splatter
suppressPackageStartupMessages({
  library(splatter)
  library(Rtsne)
  library(ggplot2)
  library(repr)
})


simulate <- function(nGroups=2, nGenes=200, batchCells=2000, dropout=3)
{
    if (nGroups > 1) method <- 'groups'
    else             method <- 'single'

    group.prob <- rep(1, nGroups) / nGroups

    # new splatter requires dropout.type
    if ('dropout.type' %in% slotNames(newSplatParams())) {
        if (dropout)
            dropout.type <- 'experiment'
        else
            dropout.type <- 'none'
        
        sim <- splatSimulate(group.prob=group.prob, nGenes=nGenes, batchCells=batchCells,
                             dropout.type=dropout.type, method=method,
                             seed=42, dropout.shape=-1, dropout.mid=dropout)

    } else {
        sim <- splatSimulate(group.prob=group.prob, nGenes=nGenes, batchCells=batchCells,
                             dropout.present=!dropout, method=method,
                             seed=42, dropout.shape=-1, dropout.mid=dropout)        
    }

    counts     <- as.data.frame(t(counts(sim)))
    truecounts <- as.data.frame(t(assays(sim)$TrueCounts))

    dropout    <- assays(sim)$Dropout
    mode(dropout) <- 'integer'

    cellinfo   <- as.data.frame(colData(sim))
    geneinfo   <- as.data.frame(rowData(sim))

    list(counts=counts,
         cellinfo=cellinfo,
         geneinfo=geneinfo,
         truecounts=truecounts)
}
sim <- simulate(nGroups=6, dropout=1) ## 因为这个地方的dropout从1变成了3,导致两者不一致

counts <- sim$counts
geneinfo <- sim$geneinfo
cellinfo <- sim$cellinfo
truecounts <- sim$truecounts

X <- t(counts) ## counts with dropout
Y <- as.integer(substring(cellinfo$Group,6))
Y <- Y-1

#X.normalized <- apply(X, 2, function(z) z/sum(z))
X_t=t(X)
me=median(apply(X_t,1,sum))
sf=me/apply(X_t,1,sum)
X.normalized=diag(sf)%*%X_t

X.normalized <- log(X.normalized + 1)
tsne.X <- Rtsne(X.normalized, max_iter = 1000)
  
tsne_plot.X <- data.frame(`x-tsne` = tsne.X$Y[,1], `y-tsne` = tsne.X$Y[,2], 
                        truelabel = Y, check.names = F)
tsne_plot.X$truelabel <- factor(tsne_plot.X$truelabel, levels = c(0:max(Y)))

ggplot(tsne_plot.X) + geom_point(aes(x=`x-tsne`, y=`y-tsne`, color=truelabel), size=0.5) +
          labs(color='true label') +
          ggtitle("Simulated data") +
          theme_bw() +
          theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), 
                legend.key = element_rect(fill = 'white', colour = 'white'), legend.position="none",
                axis.title.y=element_blank(), axis.title.x=element_blank())

# sim <- simulate()

# counts <- sim$counts
# geneinfo <- sim$geneinfo
# cellinfo <- sim$cellinfo
# truecounts <- sim$truecounts
## 我也可以在R中显示的

结果如下
在这里插入图片描述

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值