手把手教你R语言CIBERSORT计算免疫浸润+Rproject的使用

18kkk

已于 2023-03-05 10:26:25 修改

阅读量2.6w

点赞数 31

分类专栏： R语言文章标签： r语言

于 2022-04-18 18:42:38 首次发布

本文链接：https://blog.csdn.net/m0_58549466/article/details/124255582

版权

R语言专栏收录该内容

18 篇文章

订阅专栏

写在开头：

本文介绍了CIBERSORT两种使用方法，大家可以自行选择，方法二简单些，方法一原始些

本文顺便倡议大家使用Rproject来管理代码，感谢生信技能树jimmy老师让我知道了这么方便的玩意，再也不用拼命setwd()和getwd()了，不想看这部分可以直接下滑。

CIBERSORTx是原版网站，建议大家去学习，并且学习他们发的经典文章

鸣谢：生信技能树jimmy老师和 Biomamba 生信基地 BIOMAMBA老师

使用Rproject管理R项目

1.提前在你想要储存代码的地方建一个文件夹，然后打开Rstudio 中选择 NEW Project

2.选择Existing Directory，因为刚刚已经提前建好了文件夹。如果刚刚没有建，选NEW Directory就会给你建一个，属于个人习惯，都可以。

3.建立Project，比如我在zhangming这个文件夹下建立，点击Create Project后即可建立

4.在zhangming文件夹中出现了相关的project文件

5.双击这个文件，就会直接定位在这个工作路径，而不需要切换setwd()

6.再新建script,新建的code都会在同一个工作环境/工作文件夹内，非常方便，尤其是对于大量的代码学习，可以很好的节省时间。当然如果你不喜欢这样，也可以直接进入下面的CIBERSORT学习。具体不懂的地方还可以看其他人对project的解释，亲测很好上手。

下面进入CIBERSORT的学习

首先安装包

# Install the CIBERSORT dependencies. If any of them fails to install,
# installing the bseqsc package further down pulls them in as well.
if (!requireNamespace("e1071", quietly = TRUE)) {
  install.packages("e1071")
}
# `parallel` ships with base R, so it needs no install.packages() call.
# `preprocessCore` is distributed through Bioconductor rather than CRAN.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
if (!requireNamespace("preprocessCore", quietly = TRUE)) {
  BiocManager::install("preprocessCore")
}
library(e1071)
library(preprocessCore)
library(parallel)

if (!requireNamespace("devtools", quietly = TRUE)) {
  install.packages("devtools")
}
library(devtools)
if (!requireNamespace("bseqsc", quietly = TRUE)) {
  devtools::install_github("shenorrlab/bseqsc")
}
# bseqsc brings along most CIBERSORT dependencies; install it if the three
# packages above could not be installed directly.
library(bseqsc)

方法一：自行创造函数法，较复杂。新手建议方法二

此法使用Cibersort工具需要三个文件：
1、sourcecibersort.R
2、LM22.txt
3、genes_exp.txt

1.sourcecibersort.R

直接把下列代码新建一个script，然后保存，保存名字为sourcecibersort.R

#' CIBERSORT R script v1.03 (last updated 07-10-2015)
#' Note: Signature matrix construction is not currently available; use java version for full functionality.
#' Author: Aaron M. Newman, Stanford University (amnewman@stanford.edu)
#' Requirements:
#'       R v3.0 or later. (dependencies below might not work properly with earlier versions)
#'       install.packages('e1071')
#'       install.packages('parallel')
#'       install.packages('preprocessCore')
#'       if preprocessCore is not available in the repositories you have selected, run the following:
#'           source("http://bioconductor.org/biocLite.R")
#'           biocLite("preprocessCore")
#' Windows users using the R GUI may need to Run as Administrator to install or update packages.
#' This script uses 3 parallel processes.  Since Windows does not support forking, this script will run
#' single-threaded in Windows.
#'
#' Usage:
#'       Navigate to directory containing R script
#'
#'   In R:
#'       source('CIBERSORT.R')
#'       results <- CIBERSORT('sig_matrix_file.txt','mixture_file.txt', perm, QN)
#'
#'       Options:
#'       i)  perm = No. permutations; set to >=100 to calculate p-values (default = 0)
#'       ii) QN = Quantile normalization of input mixture (default = TRUE)
#'
#' Input: signature matrix and mixture file, formatted as specified at http://cibersort.stanford.edu/tutorial.php
#' Output: matrix object containing all results and tabular data written to disk 'CIBERSORT-Results.txt'
#' License: http://cibersort.stanford.edu/CIBERSORT_License.txt
#' Core algorithm
#'
#' Fits a linear nu-support-vector regression of `y` on `X` for nu = 0.25,
#' 0.5 and 0.75, converts every fit into non-negative weights that sum to
#' one, and keeps the fit whose reconstructed mixture has the lowest RMSE
#' against `y`.
#'
#' @param X cell-specific gene expression (one column per cell type)
#' @param y mixed expression per sample
#' @return A list with `w` (normalized weights of the selected model, a
#'   one-row matrix with one entry per column of `X`), `mix_rmse` (RMSE
#'   between the reconstructed mixture and `y`) and `mix_r` (their
#'   correlation).
#' @export
CoreAlg <- function(X, y) {

  # try different values of nu
  nu_grid <- c(0.25, 0.5, 0.75)
  svn_itor <- length(nu_grid)

  fit_svr <- function(i) {
    e1071::svm(X, y, type = "nu-regression", kernel = "linear",
               nu = nu_grid[i], scale = FALSE)
  }

  # mclapply relies on forking, which Windows lacks, so run serially there
  n_cores <- if (Sys.info()[["sysname"]] == "Windows") 1 else svn_itor
  out <- parallel::mclapply(seq_len(svn_itor), fit_svr, mc.cores = n_cores)

  # Collapse a fitted model into mixing weights: combine the support vectors
  # with their coefficients, clip negative weights to zero, scale to sum 1.
  model_weights <- function(model) {
    raw <- t(model$coefs) %*% model$SV
    raw[which(raw < 0)] <- 0
    raw / sum(raw)
  }

  rmses <- numeric(svn_itor)
  corrv <- numeric(svn_itor)
  weight_list <- vector("list", svn_itor)

  # do cibersort: score every candidate model by how well its weights
  # reconstruct the observed mixture
  for (i in seq_len(svn_itor)) {
    w_i <- model_weights(out[[i]])
    weight_list[[i]] <- w_i
    k <- rowSums(sweep(X, MARGIN = 2, w_i, "*"))
    rmses[i] <- sqrt(mean((k - y)^2))
    corrv[i] <- cor(k, y)
  }

  # pick best model (lowest RMSE)
  mn <- which.min(rmses)
  if (length(mn) == 0L) {
    # which.min() skips NA/NaN; nothing left means every fit degenerated
    # (e.g. all weights were clipped to zero)
    stop("CoreAlg: no model produced a finite RMSE", call. = FALSE)
  }

  list("w" = weight_list[[mn]], "mix_rmse" = rmses[mn], "mix_r" = corrv[mn])
}

#' do permutations
#'
#' Builds an empirical null distribution of correlation coefficients by
#' repeatedly drawing random expression values from `Y`, standardizing them
#' and running them through `CoreAlg()`.
#'
#' @param perm Number of permutations
#' @param X cell-specific gene expression
#' @param Y mixed expression (all samples); the random mixtures are drawn
#'   from the pooled entries of this matrix
#' @return A list with a single element `dist`: a one-column numeric matrix
#'   holding the correlation reported by `CoreAlg()` for each permutation
#'   (zero rows when `perm` is 0).
#' @export
doPerm <- function(perm, X, Y) {
  # Pool every expression value once; indexing a numeric vector is far
  # cheaper than building a list with one element per matrix entry.
  pool <- as.numeric(data.matrix(Y))
  n_genes <- dim(X)[1]
  n_perm <- max(0L, as.integer(perm))

  # Preallocate instead of growing the result with rbind() on every pass
  null_r <- numeric(n_perm)

  for (i in seq_len(n_perm)) {
    # random mixture
    yr <- pool[sample(length(pool), n_genes)]

    # standardize mixture
    yr <- (yr - mean(yr)) / sd(yr)

    # run CIBERSORT core algorithm and store its correlation
    null_r[i] <- CoreAlg(X, yr)$mix_r
  }

  list("dist" = matrix(null_r, ncol = 1))
}

#' Main functions
#'
#' Reads a signature matrix and a mixture file (tab-separated, header row,
#' gene names in the first column), estimates the weight of every signature
#' column in each mixture column with `CoreAlg()`, writes the table to
#' 'CIBERSORT-Results.txt' in the working directory and returns it.
#'
#' @param sig_matrix file path to gene expression from isolated cells
#' @param mixture_file heterogenous mixed expression
#' @param perm Number of permutations
#' @param QN Perform quantile normalization or not (TRUE/FALSE)
#' @return A numeric matrix with one row per mixture (sample) and one column
#'   per signature column, followed by "P-value", "Correlation" and "RMSE".
#'   "P-value" is 9999 when `perm` is 0 (no permutations were run).
#' @export
CIBERSORT <- function(sig_matrix, mixture_file, perm = 0, QN = TRUE) {

  # read in data
  X <- read.table(sig_matrix, header = TRUE, sep = "\t", row.names = 1,
                  check.names = FALSE)
  Y <- read.table(mixture_file, header = TRUE, sep = "\t", row.names = 1,
                  check.names = FALSE)

  X <- data.matrix(X)
  Y <- data.matrix(Y)

  # Missing values would otherwise surface later as a cryptic failure
  if (anyNA(X)) {
    stop("sig_matrix contains missing values (NA); remove them first",
         call. = FALSE)
  }
  if (anyNA(Y)) {
    stop("mixture_file contains missing values (NA); remove them first",
         call. = FALSE)
  }

  # order; drop = FALSE keeps a single-sample mixture as a one-column matrix
  X <- X[order(rownames(X)), , drop = FALSE]
  Y <- Y[order(rownames(Y)), , drop = FALSE]

  P <- perm # number of permutations

  # anti-log if max < 50 in mixture file
  if (max(Y) < 50) {
    Y <- 2^Y
  }

  # quantile normalization of mixture file
  if (QN == TRUE) {
    tmpc <- colnames(Y)
    tmpr <- rownames(Y)
    Y <- preprocessCore::normalize.quantiles(Y)
    colnames(Y) <- tmpc
    rownames(Y) <- tmpr
  }

  # intersect genes
  Y <- Y[rownames(Y) %in% rownames(X), , drop = FALSE]
  X <- X[rownames(X) %in% rownames(Y), , drop = FALSE]
  if (nrow(X) == 0L) {
    stop("sig_matrix and mixture_file share no gene names", call. = FALSE)
  }

  # standardize sig matrix
  X <- (X - mean(X)) / sd(as.vector(X))

  # empirical null distribution of correlation coefficients
  if (P > 0) {
    nulldist <- sort(doPerm(P, X, Y)$dist)
  }

  stat_names <- c(colnames(X), "P-value", "Correlation", "RMSE")
  header <- c("Mixture", stat_names)

  mixtures <- dim(Y)[2]
  pval <- 9999

  # Numeric result matrix, preallocated: one row per mixture. Keeping it
  # numeric avoids the character round-trip and works for a single sample.
  res <- matrix(NA_real_, nrow = mixtures, ncol = length(stat_names),
                dimnames = list(colnames(Y), stat_names))

  # iterate through mixtures
  for (itor in seq_len(mixtures)) {

    y <- Y[, itor]

    # standardize mixture
    y <- (y - mean(y)) / sd(y)

    # run SVR core algorithm
    result <- CoreAlg(X, y)

    # calculate p-value: position of the closest null correlation within
    # the sorted null distribution
    if (P > 0) {
      pval <- 1 - (which.min(abs(nulldist - result$mix_r)) / length(nulldist))
    }

    res[itor, ] <- c(result$w, pval, result$mix_r, result$mix_rmse)
  }

  # save results (header row, then one line per mixture led by its name)
  write.table(rbind(header, cbind(colnames(Y), res)),
              file = "CIBERSORT-Results.txt", sep = "\t",
              row.names = FALSE, col.names = FALSE, quote = FALSE)

  # return matrix object containing all results
  res
}

2.LM22.txt（文末网盘），用来作参考的免疫浸润数据集，也可以官网下载，是一样的。

3.genes_exp.txt（文末网盘），实际上就是你要处理的自己的数据或者TCGA的数据

4.启动

# Load the function definitions. sourcecibersort.R must sit in the current
# working directory (the same folder as the input files).
source("sourcecibersort.R")
# perm = 1000 permutations; QN = TRUE turns on quantile normalization.
# The input file names are up to you.
results <- CIBERSORT(sig_matrix = "LM22.txt", mixture_file = "genes_exp.txt",
                     perm = 1000, QN = TRUE)
# The returned matrix can be used for heatmaps and other plots.

5.之后的分析和方法二一样，都是对results进行绘图

方法二：使用打包的函数法，这个需要下载R包，但不需要自己创造函数了

1.还是运行以下代码，多安了个包

# Install the CIBERSORT dependencies. If any of them fails to install,
# installing the bseqsc package further down pulls them in as well.
if (!requireNamespace("e1071", quietly = TRUE)) {
  install.packages("e1071")
}
# `parallel` ships with base R, so it needs no install.packages() call.
# `preprocessCore` is distributed through Bioconductor rather than CRAN.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
if (!requireNamespace("preprocessCore", quietly = TRUE)) {
  BiocManager::install("preprocessCore")
}
library(e1071)
library(preprocessCore)
library(parallel)

if (!requireNamespace("devtools", quietly = TRUE)) {
  install.packages("devtools")
}
library(devtools)
if (!requireNamespace("bseqsc", quietly = TRUE)) {
  devtools::install_github("shenorrlab/bseqsc")
}
# bseqsc brings along most CIBERSORT dependencies; install it if the three
# packages above could not be installed directly.
library(bseqsc)

# Install the CIBERSORT package ----
if (!requireNamespace("CIBERSORT", quietly = TRUE)) {
  devtools::install_github("Moonerss/CIBERSORT")
}
library(CIBERSORT)
# All analysis packages are installed at this point.

# Plotting packages (ComplexHeatmap comes from Bioconductor, not CRAN)
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
if (!requireNamespace("pheatmap", quietly = TRUE)) {
  install.packages("pheatmap")
}
if (!requireNamespace("ComplexHeatmap", quietly = TRUE)) {
  BiocManager::install("ComplexHeatmap")
}
library(ggplot2)
library(pheatmap)
library(ComplexHeatmap)

安装好以后就可以使用cibersort函数了

2.它的好处在于自带了LM22文件和测试文件，你不需要额外去下载和整理了。我本人已经测试过这个LM22和自己读取的LM22完全一样。直接运行下列code即可出图

# Keep the LM22 TXT file at hand as well: your own input has to follow the
# same layout. With the CIBERSORT package attached, its bundled datasets
# can be loaded directly.
data("LM22")
data("mixed_expr") # TCGA demo data; use your own expression data for real work

# Start exploring: show the top-left 5 x 5 corner of each table
head(LM22[, 1:5], n = 5)
head(mixed_expr[, 1:5], n = 5)

# Deconvolve the demo mixture (mixed_expr) against the LM22 signature matrix
results <- cibersort(mixture_file = mixed_expr, sig_matrix = LM22)

# Reading `results`:
# rows are samples, columns are cell types, and each value is the estimated
# infiltration level (fraction) of that cell type in that sample.
# Three more columns follow the cell types:
# P-value: significance of the deconvolution for that sample
# Correlation / RMSE: presumably computed as in the standalone CIBERSORT
#   script, i.e. the correlation and the root mean squared error between the
#   observed mixture and the mixture reconstructed from the estimated
#   fractions (TODO confirm against the package documentation)

# heatmap
# Cell-type columns come first in `results`, one per LM22 column; the
# trailing P-value / Correlation / RMSE columns are left out of the plots.
cell_cols <- seq_len(ncol(LM22))

# Scaling by row (within each sample) shows which cell type dominates inside
# every sample; in the demo data M2 has the highest infiltration level.
rowscale <- results[, cell_cols]
rowscale <- rowscale[, colSums(rowscale) > 0] # drop all-zero cell types
pheatmap(rowscale,
         scale = "row",        # without scaling the raw fractions are hard to read
         cluster_cols = TRUE,  # cluster columns; FALSE keeps the original order
         cluster_rows = FALSE, # do not cluster rows
         angle_col = "315")    # tilt of the column labels

# Scaling by column instead highlights, for every cell type, the samples in
# which it is most abundant.
columnscale <- results[, cell_cols]
columnscale <- columnscale[, colSums(columnscale) > 0] # drop all-zero cell types
pheatmap(columnscale,
         scale = "column",
         cluster_cols = FALSE,
         cluster_rows = TRUE,
         angle_col = "315")

# Stacked proportion bar chart
my36colors <- c(
  "#E5D2DD", "#53A85F", "#F1BB72", "#F3B1A0", "#D6E7A3", "#57C3F3",
  "#476D87", "#E95C59", "#E59CC4", "#AB3282", "#23452F", "#BD956A",
  "#8C549C", "#585658", "#9FA3A8", "#E0D4CA", "#5F3D69", "#C5DEBA",
  "#58A4C3", "#E4C755", "#F7F398", "#AA9A59", "#E63863", "#E39A35",
  "#C1E6F3", "#6778AE", "#91D0BE", "#B53E2B", "#712820", "#DCC1DD",
  "#CCE0F5", "#CCC9E6", "#625D9E", "#68A180", "#3A6963", "#968175"
)

# Cell-type columns only (the first ncol(LM22) columns of `results`)
cellnum <- as.matrix(results[, seq_len(ncol(LM22))])

# Rescale every sample so its fractions sum to 1; after the transpose the
# matrix is cell type x sample.
cell.prop <- t(cellnum / rowSums(cellnum))

# Long format, built in one step instead of rbind()-ing inside a loop:
# one row per (sample, cell type) pair, samples varying slowest.
data4plot <- data.frame(
  proportion = as.numeric(cell.prop),
  celltype = rep(rownames(cell.prop), times = ncol(cell.prop)),
  sample = rep(colnames(cell.prop), each = nrow(cell.prop))
)

ggplot(data4plot, aes(sample, proportion, fill = celltype)) +
  geom_bar(stat = "identity", position = "fill") +
  scale_fill_manual(values = my36colors) + # custom fill colours
  ggtitle("cell proportion") +
  theme_bw() +
  theme(axis.ticks.length = unit(0.5, "cm"),
        axis.title.x = element_text(size = 1)) +
  # tilt the x-axis labels by 45 degrees
  theme(axis.text.x = element_text(angle = 45, hjust = 0.5, vjust = 0.5)) +
  guides(fill = guide_legend(title = NULL))

LM22：

#########链接：https://pan.baidu.com/s/1eQSEekekozS5osgydwzk1w
#@####提取码：fk88

# Read the downloaded LM22 table; its first column (gene names) becomes the
# row names, so only the expression columns remain.
LM22read <- read.csv("LM22.csv", header = TRUE, row.names = 1)
gene <- rownames(LM22read)
# LM22 is the dataset bundled with the CIBERSORT package installed earlier
data(LM22)
# TRUE here means the downloaded file and the bundled LM22 hold the same values
all(LM22 == LM22read)

鸣谢：生信技能树jimmy老师和 Biomamba 生信基地 BIOMAMBA老师

有疑问可以邮件联系我，会尽力帮忙：yunbk@mail2.sysu.edu.cn

2023.03.05更新

结合不少小伙伴的私信和邮件更新几个点

1.自己的表达矩阵格式：按照作者的文档和示例数据，应该是不取log的，也不能做其他的normalization处理。当然如果是log处理过的（矩阵最大值小于50时），cibersort会自动还原（2^x），所以也不必担心。数据应该是标准化后的数据，比如FPKM、TPM、CPM，不应该用count。

2.在数据量比较大，样本多的时候，请下载R包使用cibersort，因为source函数的读取速度会变得很慢。

3.计算前需要排除空值（NA），不然会报错

4.最新的版本是cibersortX,但是好像现在维护有点问题，大家可以自行搜索一下这个方法