Topic 1 Single_Cell_analysis(4)
文章目录
Part 1 Scissor() debug——binomial 数据处理
1.1 加载package
#加载package
library(Seurat)
library(Matrix)
library(Scissor)
library(preprocessCore)
1.2 加载单细胞数据
#单细胞数据下载
location <- "https://xialab.s3-us-west-2.amazonaws.com/Duanchen/Scissor_data/"
load(url(paste0(location, 'scRNA-seq.RData')))
dim(sc_dataset)
1.3 将单细胞数据转化为Seurat类型
#将数据转换为Seurat类型,并建立细胞与细胞之间的similarity网络,还会得到多种方法降维后的簇
sc_dataset <- Seurat_preprocessing(sc_dataset, verbose = F)
class(sc_dataset)
1.4 加载bulk数据
#bulk数据下载,TCGA中survival数据作为phenotype
load(url(paste0(location, 'TCGA_LUAD_exp2.RData')))
load(url(paste0(location, 'TCGA_LUAD_TP53_mutation.RData')))
1.5 parameter 赋值
family = "binomial"
phenotype <- TP53_mutation
tag <- c("wild-type", "TP53 mutant")
1.6 检查bulk-cell和single-cell的intersect
#取bulk和single的gene交集
common <- intersect(rownames(bulk_dataset), rownames(sc_dataset))
length(rownames(bulk_dataset))
length(rownames(sc_dataset))
length(common)
1.7 得到processed data 和 细胞间的network
sc_exprs<-as.matrix(sc_dataset@assays$RNA@data)
network<-as.matrix(sc_dataset@graphs$RNA_snn)
1.8 将network的对角元素置0,将所有非零元素置1
diag(network)<-0
network[which(network!=0)]<-1
1.9 将bulk-cell和single-cell拼接(bulk-cell:single-cell)
(bulk 共有498个sample,single共有4102个cell)
dataset0<-cbind(bulk_dataset[common,],sc_exprs[common,])#将bulk-dataset和sc-dataset中的common拼接(bulk-cell and sample)
1.10 对表达矩阵进行矫正
dataset1 <- normalize.quantiles(dataset0)
rownames(dataset1)<-rownames(dataset0)#行赋值
colnames(dataset1)<-colnames(dataset0)#列赋值
1.11 得到bulk和single的表达矩阵
Expression_bulk <- dataset1[,1:ncol(bulk_dataset)]#bulk-cell的基因表达
Expression_cell <- dataset1[,(ncol(bulk_dataset) + 1):ncol(dataset1)]#single-cell的基因表达
Part 2 Scissor() debug——binomial 后续
2.1 合并bulk基因表达和single基因表达
X<-cor(Expression_bulk,Expression_cell)#bulk基因表达矩阵和single基因表达矩阵的相关性
2.2 质控(卡分位数)
#质控
quality_check <- quantile(X)#分位数
print("|**************************************************|")
print("Performing quality-check for the correlations")
print("The five-number summary of correlations:")
print(quality_check)
print("|**************************************************|")
2.3 50% correlation
#50%分位数小于0.01
if (quality_check[3] < 0.01){
warning("The median correlation between the single-cell and bulk samples is relatively low.")
}
2.4 判断是否有两个label(使用logistic regression)
if (family == "binomial"){
Y <- as.numeric(phenotype)
z <- table(Y)
if (length(z) != length(tag)){
stop("The length differs between tags and phenotypes. Please check Scissor inputs and selected regression type.")
}else{
print(sprintf("Current phenotype contains %d %s and %d %s samples.", z[1], tag[1], z[2], tag[2]))
print("Perform logistic regression on the given phenotypes:")
}
}
2.5 存储RData
#保存当前数据(其中X表示bulk-cell和single-cell表达矩阵的相关性,y表示phenotype的table,其中1表示tp53_mutant,0表示wild-type)
save(X, Y, network, Expression_bulk, Expression_cell, file = 'LUAD_TP53_mutant.RData')
2.6 惩罚参数最小化——APML1 function()
#参数和惩罚系数最小化 加惩罚函数降低过拟合现象
APML1=function(x, y, family=c("gaussian", "binomial", "cox"), penalty=c("Lasso","Enet", "Net"), Omega=NULL, alpha=1.0, lambda=NULL, nlambda=50, rlambda=NULL, wbeta=rep(1,ncol(x)), sgn=rep(1,ncol(x)), nfolds=1, foldid=NULL, inzero=TRUE, isd=FALSE, iysd=FALSE, keep.beta=FALSE, ifast=TRUE, thresh=1e-7, maxit=1e+5) {
#fcall=match.call()
family=match.arg(family)
penalty=match.arg(penalty)
if (penalty=="Net" & is.null(Omega)) {
penalty="Enet"
cat("Enet was performed as no input of Omega")
}
if (penalty %in% c("Enet","Net") & alpha==1.0) {
penalty="Lasso"
cat("Lasso was performed as alpha=1.0")
}
if (alpha!=1.0) {
if (is.null(Omega)) {
penalty="Enet"
} else if (!is.null(Omega)) {
penalty="Net"
}
} else {
penalty="Lasso"
}
wbeta=abs(wbeta)
fit=switch(family,
"gaussian"=LmL0(x,y,Omega,alpha,lambda,nlambda,rlambda,nfolds,foldid,inzero,wbeta,sgn,isd,iysd,keep.beta,thresh,maxit),
"binomial"=LogL0(x,y,Omega,alpha,lambda,nlambda,rlambda,nfolds,foldid,inzero,wbeta,sgn,isd,keep.beta,thresh,maxit,threshP=1e-5),
"cox"=CoxL0(x,y,Omega,alpha,lambda,nlambda,rlambda,nfolds,foldid,inzero,wbeta,sgn,isd,keep.beta,ifast,thresh,maxit))
fit$family=family
#fit$call=fcall
class(fit)="APML1"
return(fit)
}
2.7 选择有信息的cell(informative cells)
for (i in 1:length(alpha)){
set.seed(123)
#惩罚参数最小化
fit0 <- APML1(X, Y, family = family, penalty = "Net", alpha = alpha[i], Omega = network, nlambda = 100, nfolds = min(10,nrow(X)))
fit1 <- APML1(X, Y, family = family, penalty = "Net", alpha = alpha[i], Omega = network, lambda = fit0$lambda.min)
if (family == "binomial"){#当family==binomail时,fit1得到的beta共有4103个
Coefs <- as.numeric(fit1$Beta[2:(ncol(X)+1)])
}else{
Coefs <- as.numeric(fit1$Beta)
}
Cell1 <- colnames(X)[which(Coefs > 0)]#系数大于0的细胞,作为scissor+细胞
Cell2 <- colnames(X)[which(Coefs < 0)]#系数小于0的细胞,作为scissor-细胞
percentage <- (length(Cell1) + length(Cell2)) / ncol(X)#选择的细胞不能太少,cutoff是至少的限制
print(sprintf("alpha = %s", alpha[i]))
print(sprintf("Scissor identified %d Scissor+ cells and %d Scissor- cells.", length(Cell1), length(Cell2)))
print(sprintf("The percentage of selected cell is: %s%%", formatC(percentage*100, format = 'f', digits = 3)))
if (percentage < cutoff){
break
}
cat("\n")
}
#Scissor的最后返回值alpha,lambda,family,coef,pos_cell和neg_cell
list(alpha = alpha[i], lambda = fit0$lambda.min, family = family)
Coefs = Coefs
Scissor_pos = Cell1
Scissor_neg = Cell2
Coefs
Scissor_pos
Scissor_neg
Question?
惩罚系数的计算方法——APML1()函数详细解析