GEO芯片数据处理、多芯片数据合并、差异分析、核心基因提取、核心基因与免疫细胞相关性

R:https://www.r-project.org/
Rstudio:  https://posit.co/products/open-source/rstudio/

一.数据准备

  1. 下载探针矩阵probe.txt和平台注释ann.txt

部分平台注释文件不能下载,解决办法:

  • gset <- getGEO("GSE149507",destdir = ".",getGPL = T)→gset[["GSE149507_series_matrix.txt.gz"]]@featureData@data
  • 下载soft文件:soft <-getGEO(filename="GSE149507_family.soft")→soft@gpls[["GPL23270"]]@dataTable@table#getGEO()可下载网页文件,也可读取soft文件

二. 探针矩阵转换为基因矩阵

!setwd("D:\\immune cell infiltration\\GEO")
getwd()
2.在GEO数据库中下载GSE信息
#install.packages("tidyverse")
library(GEOquery)
!gset <- getGEO(GEO = "GSE124226",destdir = ".",getGPL = F)#destdir="."表示下载到当前路径
e2 <- gset[[1]] 
phe <- e2@phenoData@data  #提取临床信息
exp <- e2@assayData[["exprs"]]  #提取表达矩阵

3.在GEO数据库中下载对应平台信息:(平台信息含探针对应的基因名)  该处ID_symbol处需替换
library(data.table)
ann=fread("ann.txt",sep="\t",header=T,data.table = F)#能在excel中打开的文件都可以用fread函数读取
ID_symbol <- ann[,c("ID","Gene Symbol")]  

4.部分探针对应的基因名有两个(如"DDR1 /// MIR4640" ),将排在后面的去掉 该处ID_symbol、ID_gene_symbol处需替换
symbol <- ID_symbol$`Gene Symbol`
symbol1 <- strsplit(symbol,split=" /// ",fixed=T)#fixed=T表示精确查找
gene_symbol <- sapply(symbol1,function(x){x[1]})#提取每一个子列表中的第一个元素
ID_gene_symbol <- data.frame(ID_symbol$ID,gene_symbol)
rm(ID_symbol,symbol,symbol1,gene_symbol)

5.将表达矩阵与基因名合并  
library(dplyr)
exp <- as.data.frame(exp)#merge函数只对两个data.frame进行合并
exp1 <- merge(ID_gene_symbol,exp,by.x = 1,by.y = 0)
exp2 <- distinct(exp1,gene_symbol,.keep_all=T)
exp3 <- na.omit(exp2)#将缺失的基因名去除
rownames(exp3) <- exp3$gene_symbol
exp4 <- exp3[,-c(1,2)]

6.将对照组样本放在前面并进行批次校正
!control <- c()#对照样本所在列
!treat <- c(seq(1,35,2))##查看tumor样本所在列
exp5 <- exp4[,c(control,treat)]#将对照样本放在前面
!group_list <- c(rep("control",18),rep("treat",18))
group_list <- factor(group_list)
group_list <- relevel(group_list,ref="control")#强制限定顺序,control在前

library(limma)
exp6 <- normalizeBetweenArrays(exp5)#除去批次效应
boxplot(exp5,outline=F,notch=T,col=group_list,las=2)
boxplot(exp6,outline=F,notch=T,col=group_list,las=2)#绘制出的图形相同,说明样本已经进行过归一化处理
rm(ID_gene_symbol,exp,exp1,exp2,exp3,exp4,exp5,control,treat)
至此,我们获得了想要的横坐标为基因名,纵坐标为样本名的表达矩阵exp6

查看数据是否已经取了log2

ex <- exp6
qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
LogC <-(qx[5]>100)||
(qx[6]-qx[1] > 50 && qx[2] > 0) ||
(qx[2]>0&&qx[2]<1&&qx[4]>1&&qx[4]<2)

if(LogC){ 
ex[which(ex<=0)]<-NaN 
exprSet <- log2(ex) 
print("需要取log2")}else{print("无需取log2")}
rm(ex,exp6,LogC,qx)
exp6 <- exprSet
write.csv(exp6,"GSE34526.NORMALIZE.CSV")

三、多芯片数据合并

  • 准备处理好的多个芯片数据
setwd("D:\\hebing")
getwd()
BiocManager::install("sva")
# 安装包
install.packages("FactoMineR")
install.packages("factoextra")

library(sva)
library(FactoMineR)
library(factoextra)

GSE1=read.csv(file = "GSE34526.NORMALIZE.csv",header = T,row.names = 1)
GSE2=read.csv(file = "GSE137684.NORMALIZE.csv",header = T,row.names = 1)
GSE3=read.csv(file = "GSE80432.NORMALIZE.csv",header = T,row.names = 1)


#GEO数据数据合并
name1 <- intersect(rownames(GSE1),rownames(GSE2))
expr <- cbind(GSE1[name1,],GSE2[name1,])
#若是多个数据(>2),则执行如下命令
name1 <- intersect(rownames(GSE1),rownames(GSE2))
name2 <- intersect(name1,rownames(GSE3))
expr <- cbind(GSE1[name2,],GSE2[name2,],GSE3[name2,])

batch <- c(rep("GSE5090",17),rep("GSE43264",15),rep("GSE6798",29),rep("GSE34526",10),rep("GSE10946",23))
tissue <- c(rep(c("control","treat","control","treat","control","treat","control","treat","control","treat"),c(8,9,7,8,13,16,3,7,11,12)))
mode <- model.matrix(~tissue)

########combat去除批次效应
combat_expr <- ComBat(dat = expr,
                      batch = batch,
                      mod = mode)
#combat_expr就是校正后且合并后的GEO数据

####removeBatchEffect去除批次效应
limma_expr <- removeBatchEffect(expr,batch = batch,design = mode)

rm(expr,GSE1,GSE2,GSE3,name1,name2)

#矫正前PCA分析
pre.pca <- PCA(t(expr),graph = FALSE)#expr1与前文对应
fviz_pca_ind(pre.pca,
             geom= "point",
             col.ind = batch,#batch与前文对应
             addEllipses = TRUE,#添加椭圆
             legend.title="Group"  )#图例名为Group
#此处保存一下图片,方便后续对比

#矫正后PCA分析(1和2都运行,选择百分数值较小的一个进行后续分析)
#1
combat.pca <- PCA(t(combat_expr),graph = FALSE)
fviz_pca_ind(combat.pca,
             geom= "point",
             col.ind = batch,
             addEllipses = TRUE,
             legend.title="Group"  )

#2或
combat.pca <- PCA(t(limma_expr),graph = FALSE)
fviz_pca_ind(combat.pca,
             geom= "point",
             col.ind = batch,
             addEllipses = TRUE,
             legend.title="Group"  )
write.csv(limma_expr,"limma_expr.csv")
write.csv(combat_expr,"combat_expr.csv")

#箱线图查看前后对比
par(mfrow=c(1,2))
boxplot(as.data.frame(expr),main="Original")
boxplot(as.data.frame(combat_expr),main="Batch corrected")

选取较好的矫正方法进行后续分析

 最终得到combat_expr.csv或limma_expr.csv,文件为合并后的GEO数据(行名样本名,列名基因名),样本名顺序按GSE1,GSE2,GSE3排列,每个GSE中对照组样本在前,疾病组样本在后

 四、将多芯片数据整理为对照组在前,疾病组样本在后

tissue <- c(rep(c("control","treat","control","treat","control","treat"),c(3,7,4,8,8,8)))
setwd("C:\\Users\\LENOVO\\Desktop\\wgcna")
limma_expr <- read.table("limma_expr.csv",sep=",",header=T,row.names = 1)
data <- cbind(t(limma_expr),tissue)
conData <- subset(data,tissue=="control")
treatData <- subset(data,tissue=="treat")
conNum <- nrow(conData)
treatNum <- nrow(treatData)
data=rbind(conData,treatData)
data <- t(data[,-15068])
write.csv(data,"limma_expr_control+treat.csv")
rm(conData,treatData,limma_expr)

fenzu=data.frame(Normal=c(rep(1,conNum),rep(0,treatNum)),
                 Tumor=c(rep(0,conNum),rep(1,treatNum)))
row.names(fenzu)=colnames(data)
write.csv(fenzu,"fenzu.csv")#行名样品名,列名Normal和Tumor,是用1表示,否用0表示,分组也可用作临床信息

 最终得到data(limma_expr_control+teart.csv),fenzu.csv

五、差异分析(第六节)

library(limma)
exprSet <- read.csv("limma_expr_control+treat.csv",row.names = 1)
#构建分组矩阵
fenzu <- read.csv("fenzu.csv",row.names = 1)
table(fenzu)
group <- c(rep("con",15),rep("treat",23))
group <- factor(group,levels = c("con","treat"))
design <- model.matrix(~0+group)
colnames(design) <- levels(group)
design

### 2.构建比较矩阵
contrast.matrix <- makeContrasts(treat - con,levels=design)
 

#线性拟合模型构建

fit <- lmFit(exprSet,design)#线性模型拟合
fit <- contrasts.fit(fit, contrast.matrix) 
fit <- eBayes(fit)#贝叶斯检验

### 4.最终得到差异分析结果
allDiff=topTable(fit,number=Inf)


#输出所有基因的差异情况
write.table(allDiff,"allDiff.txt",sep="\t",col.names = NA)
#write.csv(allDiff,"allDiff.csv")

#基因上调和下调设定
data <- allDiff
data$significant <- "stable"
data$significant[data$logFC>=1 & data$P.Value<0.05]="up"
data$significant[data$logFC<=-1 & data$P.Value<0.05]="down"


#输出差异结果
diffSig=data[with(data,(abs(logFC)>1 & P.Value<0.05)),]
write.table(diffSig,"diff_1_0.05.txt",sep="\t",col.names = NA,quote=F)


#筛选差异基因(基础表达量高、变化明显、p值显著)
select.log2FC <- abs(data$logFC)>1
select.Pvalue <- data$P.Value<0.05
select.vec <- select.log2FC & select.Pvalue
table(select.vec)

degs.list <- as.character(rownames(data))[select.vec]  #筛选出的基因名
write.table(degs.list,"DEG.txt",sep="\t",quote=F,row.names=F,col.names = F)

图形绘制

4.1 火山图

图形绘制

library(ggplot2)
library(ggrepel)

#ggplot中的每一个+号代表加一个参数的内容
pdf("volcano.pdf",width = 6,height = 5)
ggplot(data,aes(logFC,-1*log10(P.Value)))+xlim(-2,2)+ylim(0,5)+ #x轴为logFC,p值越小,y值越大
  geom_point(aes(color=significant),size=0.8)+theme_classic()+  #画散点图,size代表点的大小,颜色分类按照significant
  scale_color_manual(values = c("green","black","red"))+  #点的颜色
  geom_hline(yintercept = -log10(0.05),linetype=4,size=0.3)+  #垂直于y轴的线
  geom_vline(xintercept = c(-1,1),linetype=4,size=0.3)+  #垂直于x轴的线
  theme(title = element_text(size=18),text = element_text(size=18))+
  labs(x="log2(foldchange)",y="-log10(P_Value)")
dev.off()

在火山图中标注基因名称

setwd("D:\\limma_expr\\limma差异分析")
BiocManager::install("ggrepel")
library(ggrepel)
data <- read.table("allDiff.txt",sep="\t",header=T,row.names = 1)
data$significant <- "stable"
data$significant[data$logFC>=1 & data$P.Value<0.05]="up"
data$significant[data$logFC<=-1 & data$P.Value<0.05]="down"
label.deg <- c("SYK","FCGR3A","TLR4",
                "TYROBP","FCER1G","CD4","ITGAM","PLCG2","ITGB2","VAV1")
p=ggplot(data,aes(logFC,-1*log10(P.Value)))+xlim(-1.5,2)+ylim(0,5)+
  geom_point(aes(color=significant),size=0.8)+theme_classic()+
  scale_color_manual(values = c("green","black","red"))+
  geom_hline(yintercept = -log10(0.05),linetype=4,size=0.3)+
  geom_vline(xintercept = c(-1,1),linetype=4,size=0.3)+
  theme(title = element_text(size=18),text = element_text(size=18))+
  labs(x="log2(foldchange)",y="-log10(P_Value)")
data_selected <- data[label.deg,] #筛选出的基因所在行

 p +  # 基于普通火山图p
   geom_point(data = data_selected,                             # 添加强调显示的散点,仅限于上调基因
              aes(x = logFC, y = -log10(P.Value)),
              color = 'red3', size = 4.5, alpha = 0.2) +  # 设置散点的颜色、大小和透明度
   geom_label_repel(data = data_selected,                       # 添加带边框的标签,仅限于上调基因
                    aes(x = logFC, y = -log10(P.Value), label = rownames(data_selected)),
                    seed = 233,                           # 设置随机数种子,用于确定标签位置
                    size = 3.5,                           # 设置标签的字体大小
                    color = 'black',                      # 设置标签的字体颜色
                    min.segment.length = 0,               # 始终为标签添加指引线段
                    force = 2,                            # 设置标签重叠时的排斥力
                    force_pull = 2,                       # 设置标签与数据点之间的吸引力
                    box.padding = 0.4,                    # 设置标签周围的填充量
                    max.overlaps = Inf)                   # 设置排斥重叠过多标签的阈值为无穷大,保持始终显示所有标签
 
 

4.2 热图

分组、选基因

!annotation_col1 <- data.frame(
  database <- c(rep("GEO",36)),
  CellType <- c(rep("control",18),rep("treatment",18))
)
rownames(annotation_col1) <- colnames(exp6)
class(exp6)
exp6 <- as.data.frame(exp6)
exprset.map <- exp6[label.deg,]#选取想要绘图的基因

绘图

install.packages("pheatmap")
library(pheatmap)
pheatmap::pheatmap(exprset.map,#热图的数据
                   cluster_rows = F,#行聚类(行间表达相似则聚类)
                   cluster_cols=F,#列聚类,可以看出样本之间的区分度
                   annotation_col = annotation_col1,
                   show_colnames = F,  #不展示样品名
                   scale="row",#以行标准化
                   color=colorRampPalette(c("blue","white","red"))(100))#100个渐变色

4.3 箱线图(某一个基因在对照组和疾病组中的差异)

exp.x=exp6["TTC32",]

z=t(exp.x)#行名为样本名,列为基因表达量
z=as.data.frame(z)
!z$type=c(rep("control",18),rep("treatment",18))

library(ggpubr)
library(ggsignif)
library(ggplot2)

ggboxplot(z,x="type",y="TTC32",
          width = 0.6,fill="type",
          notch = T,palette = c("#00AFBB", "red","#E7B800"),
          add = "jitter",shape="type")

###加个p值
p=ggboxplot(z,x="type",y="TTC32",
            width = 0.6,fill="type",
            notch = T,palette = c("#00AFBB", "red","#E7B800"),
            add = "jitter",shape="type")

p + stat_compare_means(aes(group =type))

若对照组和疾病组来自同一个人,可以在点与点之间连线

!z$pairinfo=pairinfo=rep(1:18,2) 第1个和第19个样本来自同一个人
!ggplot(z, aes(type,TTC32,fill=type)) +
  geom_boxplot() +
  geom_point(size=2, alpha=0.5) +
  geom_line(aes(group=pairinfo), colour="black", linetype="11") +
  xlab("") +
  ylab(paste("Expression of ","TTC32"))+
  theme_classic()+
  theme(legend.position = "none")

4.4 小提琴图

library(ggsignif)
compaired <- list(c("control", "treatment"))

library(ggsci)
ggplot(z,aes(type,TTC32,fill=type)) +
  geom_violin()+
  geom_signif(comparisons =compaired ,step_increase = 0.1,map_signif_level = c("***"=0.001, "**"=0.01, "*"=0.05),test =t.test,size=2,textsize = 6)+
  geom_jitter(shape=16, position=position_jitter(0.2))

六、 富集分析 (第九节)

5.1 

#筛选差异基因(基础表达量高、变化明显、p值显著)
select.FPKM <- data$AveExpr>5  #筛选表达量>5的基因
select.log2FC <- abs(data$logFC)>1
select.Pvalue <- data$P.Value<0.05
select.vec <- select.FPKM & select.log2FC & select.Pvalue
table(select.vec)

degs.list <- as.character(rownames(data))[select.vec]  #筛选出的基因名

5.1.1 GO

##BP、CC、MF改ont值即可
library(org.Hs.eg.db)
library(clusterProfiler)
erich.go.BP = enrichGO(gene =degs.list,
                       OrgDb = org.Hs.eg.db,
                       keyType = "SYMBOL",
                       ont = "BP",
                       pvalueCutoff = 0.05,
                       qvalueCutoff = 0.05)
dotplot(erich.go.BP,showCategory = 8)#展示8个通路#横坐标为GeneRatio,代表差异基因占通路的百分比
#或barplot(erich.go.BP,showCategory = 8)

erich.go.BP=erich.go.BP@result#通路信息
write.table(erich.go.BP,"erich.go.BP.deg.con.hf.txt",sep = "\t",col.names = NA)

 详细版

setwd("")

#install.packages("colorspace")
#install.packages("stringi")
#install.packages("ggplot2")
#install.packages("circlize")
#install.packages("RColorBrewer")

#if (!requireNamespace("BiocManager", quietly = TRUE))
#    install.packages("BiocManager")
#BiocManager::install("org.Hs.eg.db")
#BiocManager::install("DOSE")
#BiocManager::install("clusterProfiler")
#BiocManager::install("enrichplot")
#BiocManager::install("ComplexHeatmap")



library(clusterProfiler)
library(org.Hs.eg.db)
library(enrichplot)
library(ggplot2)
library(circlize)
library(RColorBrewer)
library(dplyr)
library(ComplexHeatmap)

pvalueFilter=0.05    #p值过滤条件   
adjPvalFilter=0.05   #矫正后的p值过滤条件  

#定义颜色
colorSel="p.adjust"
if(adjPvalFilter>0.05){
	colorSel="pvalue"
}
    
rt=read.table("interGenes.Type.txt", header=T, sep="\t", check.names=F)     #??ȡ?????ļ?

#提取差异基因的名称,并将其转换为entrezID
genes=unique(as.vector(rt[,1]))
entrezIDs=mget(genes, org.Hs.egSYMBOL2EG, ifnotfound=NA)
entrezIDs=as.character(entrezIDs)
rt=cbind(rt, entrezIDs)
rt=rt[rt[,"entrezIDs"]!="NA",]    #去除entrezID为NA的基因
gene=rt$entrezID
#gene=gsub("c\\(\"(\\d+)\".*", "\\1", gene)
geneFC=rt$logFC
names(geneFC)=gene

#GO富集分析
kk=enrichGO(gene=gene, OrgDb=org.Hs.eg.db, pvalueCutoff=1, qvalueCutoff=1, ont="all", readable=T)#ont="all"表示同时进行BP,CC,MF
GO=as.data.frame(kk)
GO=GO[(GO$pvalue<pvalueFilter & GO$p.adjust<adjPvalFilter),]
write.table(GO, file="GO.txt", sep="\t", quote=F, row.names = F)

#柱状图
pdf(file="barplot.pdf", width=8.5, height=7)
bar=barplot(kk, drop=TRUE, showCategory=10, label_format=130, split="ONTOLOGY", color=colorSel) + facet_grid(ONTOLOGY~., scale='free')
print(bar)
dev.off()
		
#气泡图
pdf(file="bubble.pdf", width=8.5, height=7)
bub=dotplot(kk, showCategory=10, orderBy="GeneRatio", label_format=130, split="ONTOLOGY", color=colorSel) + facet_grid(ONTOLOGY~., scale='free')
print(bub)
dev.off()

#基因和功能的关系图
pdf(file="cnetplot.pdf", width=8, height=5.25)
cnet=cnetplot(kk, circular=TRUE, foldChange=geneFC, showCategory=5, colorEdge=TRUE)
print(cnet)
dev.off()


###########GO圈图#############################
ontology.col=c("#00AFBB", "#E7B800", "#90EE90")
data=GO[order(GO$pvalue),]
datasig=data[data$pvalue<0.05,,drop=F]
BP = datasig[datasig$ONTOLOGY=="BP",,drop=F]
CC = datasig[datasig$ONTOLOGY=="CC",,drop=F]
MF = datasig[datasig$ONTOLOGY=="MF",,drop=F]
BP = head(BP,6)
CC = head(CC,6)
MF = head(MF,6)
data = rbind(BP,CC,MF)
main.col = ontology.col[as.numeric(as.factor(data$ONTOLOGY))]

#整理圈图数据
BgGene = as.numeric(sapply(strsplit(data$BgRatio,"/"),'[',1))
Gene = as.numeric(sapply(strsplit(data$GeneRatio,'/'),'[',1))
ratio = Gene/BgGene
logpvalue = -log(data$pvalue,10)
logpvalue.col = brewer.pal(n = 8, name = "Reds")
f = colorRamp2(breaks = c(0,2,4,6,8,10,15,20), colors = logpvalue.col)
BgGene.col = f(logpvalue)
df = data.frame(GO=data$ID,start=1,end=max(BgGene))
rownames(df) = df$GO
bed2 = data.frame(GO=data$ID,start=1,end=BgGene,BgGene=BgGene,BgGene.col=BgGene.col)
bed3 = data.frame(GO=data$ID,start=1,end=Gene,BgGene=Gene)
bed4 = data.frame(GO=data$ID,start=1,end=max(BgGene),ratio=ratio,col=main.col)
bed4$ratio = bed4$ratio/max(bed4$ratio)*9.5

#绘制圈图主体部分
pdf("GO.circlize.pdf",width=10,height=10)
par(omi=c(0.1,0.1,0.1,1.5))
circos.par(track.margin=c(0.01,0.01))
circos.genomicInitialize(df,plotType="none")
circos.trackPlotRegion(ylim = c(0, 1), panel.fun = function(x, y) {
  sector.index = get.cell.meta.data("sector.index")
  xlim = get.cell.meta.data("xlim")
  ylim = get.cell.meta.data("ylim")
  circos.text(mean(xlim), mean(ylim), sector.index, cex = 0.8, facing = "bending.inside", niceFacing = TRUE)
}, track.height = 0.08, bg.border = NA,bg.col = main.col)

for(si in get.all.sector.index()) {
  circos.axis(h = "top", labels.cex = 0.6, sector.index = si,track.index = 1,
              major.at=seq(0,max(BgGene),by=100),labels.facing = "clockwise")
}
f = colorRamp2(breaks = c(-1, 0, 1), colors = c("green", "black", "red"))
circos.genomicTrack(bed2, ylim = c(0, 1),track.height = 0.1,bg.border="white",
                    panel.fun = function(region, value, ...) {
                      i = getI(...)
                      circos.genomicRect(region, value, ytop = 0, ybottom = 1, col = value[,2], 
                                         border = NA, ...)
                      circos.genomicText(region, value, y = 0.4, labels = value[,1], adj=0,cex=0.8,...)
                    })
circos.genomicTrack(bed3, ylim = c(0, 1),track.height = 0.1,bg.border="white",
                    panel.fun = function(region, value, ...) {
                      i = getI(...)
                      circos.genomicRect(region, value, ytop = 0, ybottom = 1, col = '#BA55D3', 
                                         border = NA, ...)
                      circos.genomicText(region, value, y = 0.4, labels = value[,1], cex=0.9,adj=0,...)
                    })
circos.genomicTrack(bed4, ylim = c(0, 10),track.height = 0.35,bg.border="white",bg.col="grey90",
                    panel.fun = function(region, value, ...) {
                      cell.xlim = get.cell.meta.data("cell.xlim")
                      cell.ylim = get.cell.meta.data("cell.ylim")
                      for(j in 1:9) {
                        y = cell.ylim[1] + (cell.ylim[2]-cell.ylim[1])/10*j
                        circos.lines(cell.xlim, c(y, y), col = "#FFFFFF", lwd = 0.3)
                      }
                      circos.genomicRect(region, value, ytop = 0, ybottom = value[,1], col = value[,2], 
                                         border = NA, ...)
                      #circos.genomicText(region, value, y = 0.3, labels = value[,1], ...)
                    })
circos.clear()
#绘制圈图中间的图例
middle.legend = Legend(
  labels = c('Number of Genes','Number of Select','Rich Factor(0-1)'),
  type="points",pch=c(15,15,17),legend_gp = gpar(col=c('pink','#BA55D3',ontology.col[1])),
  title="",nrow=3,size= unit(3, "mm")
)
circle_size = unit(1, "snpc")
draw(middle.legend,x=circle_size*0.42)
#绘制GO分类的图例
main.legend = Legend(
  labels = c("Biological Process", "Cellular Component", "Molecular Function"),  type="points",pch=15,
  legend_gp = gpar(col=ontology.col), title_position = "topcenter",
  title = "ONTOLOGY", nrow = 3,size = unit(3, "mm"),grid_height = unit(5, "mm"),
  grid_width = unit(5, "mm")
)
#绘制富集显著性pvalue的图例
logp.legend = Legend(
  labels=c('(0,2]','(2,4]','(4,6]','(6,8]','(8,10]','(10,15]','(15,20]','>=20'),
  type="points",pch=16,legend_gp=gpar(col=logpvalue.col),title="-log10(Pvalue)",
  title_position = "topcenter",grid_height = unit(5, "mm"),grid_width = unit(5, "mm"),
  size = unit(3, "mm")
)
lgd = packLegend(main.legend,logp.legend)
circle_size = unit(1, "snpc")
print(circle_size)
draw(lgd, x = circle_size*0.85, y=circle_size*0.55,just = "left")
dev.off()

5.1.2 KEGG(需要将基因名称转换为entrez_id)

DEG.entrez_id = mapIds(x = org.Hs.eg.db,
                       keys =  degs.list,
                       keytype = "SYMBOL",
                       column = "ENTREZID")

erich.kegg.res <- enrichKEGG(gene = DEG.entrez_id,
                             organism = "hsa",#组织名称,hsa表示人
                             keyType = "kegg")

dotplot(erich.kegg.res)

zzh=erich.kegg.res@result
write.table(zzh,"erich.kegg.res.6.19.txt",sep = "\t",col.names = NA)

详细版

setwd("")     
#install.packages("colorspace")
#install.packages("stringi")
#install.packages("ggplot2")

#if (!requireNamespace("BiocManager", quietly = TRUE))
#    install.packages("BiocManager")
#BiocManager::install("org.Hs.eg.db")
#BiocManager::install("DOSE")
#BiocManager::install("clusterProfiler")
#BiocManager::install("enrichplot")



library("clusterProfiler")
library("org.Hs.eg.db")
library("enrichplot")
library("ggplot2")

pvalueFilter=0.05     
adjPvalFilter=0.05 

#定义图形的颜色
colorSel="p.adjust"
if(adjPvalFilter>0.05){
	colorSel="pvalue"
}


rt=read.table("interGenes.Type.txt", header=T, sep="\t", check.names=F)     #??ȡ?????ļ?

#提取差异基因名称,将基因名转换为entrezID
genes=unique(as.vector(rt[,1]))
entrezIDs=mget(genes, org.Hs.egSYMBOL2EG, ifnotfound=NA)
entrezIDs=as.character(entrezIDs)
rt=cbind(rt, entrezIDs)
rt=rt[rt[,"entrezIDs"]!="NA",]     
gene=rt$entrezID
#gene=gsub("c\\(\"(\\d+)\".*", "\\1", gene)
geneFC=rt$logFC
names(geneFC)=gene

#kegg富集分析
kk <- enrichKEGG(gene=gene, organism="hsa", pvalueCutoff=1, qvalueCutoff=1)
KEGG=as.data.frame(kk)
KEGG$geneID=as.character(sapply(KEGG$geneID,function(x)paste(rt$genes[match(strsplit(x,"/")[[1]],as.character(rt$entrezID))],collapse="/")))
KEGG=KEGG[(KEGG$pvalue<pvalueFilter & KEGG$p.adjust<adjPvalFilter),]
#输出显著富集的结果
write.table(KEGG, file="KEGG.txt", sep="\t", quote=F, row.names = F)

#定义显示通路的数目
showNum=30
if(nrow(KEGG)<showNum){
	showNum=nrow(KEGG)
}

#柱状图
pdf(file="barplot.pdf", width=8, height=7)
barplot(kk, drop=TRUE, showCategory=showNum, label_format=100, color=colorSel)
dev.off()

#气泡图
pdf(file="bubble.pdf", width=8, height=7)
dotplot(kk, showCategory=showNum, orderBy="GeneRatio", label_format=100, color=colorSel)
dev.off()

#基因和通路的关系图
pdf(file="cnetplot.pdf", width=9, height=5.25)
kkx=setReadable(kk, 'org.Hs.eg.db', 'ENTREZID')
cnet=cnetplot(kkx, circular=TRUE, foldChange=geneFC, showCategory=5, colorEdge=TRUE)
print(cnet)
dev.off()

5.1.3  挑选感兴趣的通路进行绘图(以erich.kegg.res.6.19.txt为例)

  • 用excel打开文档,复制粘贴感兴趣的通路包含列名到新文档1.27.txt,打开新的Rstudio
options(stringsAsFactors = FALSE)
rm(list=ls())
setwd("D:\\immune cell infiltration\\GEO")
kegg=read.table("1.27.txt",header = T,sep = "\t")
k = data.frame(kegg)
library(ggplot2)
library(dplyr)

#将GeneRatio转换为小数点样式
x=k$GeneRatio
x1=as.character(x)
a=strsplit(x1,split="/",fixed=T)
be=sapply(a,function(x){x[1]})
be1=as.numeric(be)
after=sapply(a,function(x){x[2]})
after=as.numeric(after)
?strsplit
k$GeneRatio = be1/after
rm(x,x1,a,be,be1,after)
#或者
before <- as.numeric(sub("/\\d+$",replacement = "", k$GeneRatio))
after <- as.numeric(sub("^\\d+/", "", k$GeneRatio))

font.size =10

k %>% 
  ## 对进行p值排序
  arrange(p.adjust) %>% 
  ##指定富集的通路数目(10个)
  slice(1:10) %>% 
  ## 开始ggplot2 作图,其中fct_reorder调整因子level的顺序
  ggplot(aes(GeneRatio,forcats::fct_reorder(Description,Count)))+ 
  ## 画出点图,size=Count代表气泡越大,count数越高
  geom_point(aes(color=p.adjust, size = Count)) +
  ## 调整颜色,guide_colorbar调整色图的方向
  scale_color_continuous(low="red", high="blue", guide=guide_colorbar(reverse=TRUE))+
  ## 调整泡泡的大小
  scale_size_continuous(range=c(3, 8))+
  ## 如果用ylab("")或出现左侧空白
  labs(y=NULL) +
  ## 如果没有这一句,上方会到顶
  ggtitle("")+
  ## 设定主题
  theme_bw() +
  theme(axis.text.x = element_text(colour = "black",
                                   size = font.size, vjust =1 ),
        axis.text.y = element_text(colour = "black",
                                   size = font.size, hjust =1 ),
        axis.title = element_text(margin=margin(10, 5, 0, 0),
                                  color = "black",size = font.size),
        axis.title.y = element_text(angle=90))

5.2 GSEA

法一

select.FPKM <- data$AveExpr>5  #筛选表达量>5的基因
table(select.FPKM)
gs.list <- as.character(rownames(data))[select.FPKM]  #筛选出的基因名
f1=data[gs.list,]
f1$symbol <- rownames(f1)
exp.fc <- f1[,c(8,1)]#只含有基因名和logFC值
rm(f1)
###id转化
expm.id <- bitr(exp.fc$symbol, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = "org.Hs.eg.db")
exp.fc.id <- merge(exp.fc, expm.id,by.x="symbol", by.y="SYMBOL", all=F)
exp.fc.id=na.omit(exp.fc.id)
exp.fc.id.sorted <- exp.fc.id[order(exp.fc.id$logFC, decreasing = T),]

id.fc <- exp.fc.id.sorted$logFC
names(id.fc) <- exp.fc.id.sorted$ENTREZID
head(id.fc)
rm(expm.id,exp.fc.id,exp.fc.id.sorted)
gsea <- gseKEGG(id.fc, organism = "hsa",pvalueCutoff = 0.5)
go_result <- gseGO(geneList = id.fc,
                   ont = "BP",
                   OrgDb = org.Hs.eg.db,
                   pvalueCutoff = 0.5)


####查看一下富集结果(NES绝对值大于1.5、p.adjust<0.05富集效果较好)
View(go_result@result)

gseaplot2(gsea, geneSetID = 1,title = "abc")#绘制gsea中的第一条通路
gseaplot2(gsea, geneSetID = c(2,3))#第二和第三条通路

法二

select.FPKM <- data$AveExpr>5  #筛选表达量>5的基因
table(select.FPKM)
gs.list <- as.character(rownames(data))[select.FPKM]  #筛选出的基因名
f1=data[gs.list,]
f1$symbol <- rownames(f1)
f1 <- distinct(f1,symbol,.keep_all = T)
exp.fc <- f1[,c(8,1)]#只含有基因名和logFC值
rm(f1)

deg = exp.fc$logFC
names(deg) = exp.fc$symbol
deg= sort(deg,decreasing = T)
head(deg)
install.packages("msigdbr")
library(msigdbr)
h.kegg= msigdbr(species ="Homo sapiens",category = "C2",subcategory = "KEGG")
length(unique(h.kegg$gs_exact_source))
h.kegg= h.kegg[,c(3,4)]
gsea <- GSEA(deg,pvalueCutoff = 0.5, TERM2GENE =h.kegg)
gseaplot2(gsea, geneSetID = 6, title = gsea$Description[6])
gsea1=gsea@result

write.table(gsea1,"gsea.res.txt",sep = "\t",col.names = NA)

七、核心基因(cytohubba12个打分类型的交集基因)

  • 利用cytoscape(插件为cytoHubba):12个打分类型,每个打分类型分别找出打分最高的基因,取交集所得基因即为核心基因,若找不到核心基因,则删掉几个打分类型
  • 准备输入文件:从string网站获得的网络关系文件
  • 分析完成后得到评分文件score.csv
install.packages("UpSetR")
library(UpSetR) 
setwd("D:\\immune cell infiltration\\multiarray") 
rt=read.csv("score.csv", header=T, sep=",", check.names=F, row.names=1)
colnames(rt)=c(colnames(rt)[2:ncol(rt)], "N")
scoreType=c("MCC", "DMNC", "MNC", "Degree", "EPC","BottleNeck", "EcCentricity","Closeness", "Radiality", "Betweenness", "Stress","ClusteringCoefficient")

#对打分进行排序
upsetList=c()
for(i in scoreType){
	data=rt[order(rt[,i],decreasing=T),]
	hubGene=row.names(data)[1:50]   #筛选打分最高的前50个基因
	upsetList[[i]]=hubGene
}
upsetData=fromList(upsetList)


#输出图形
pdf(file="upset.pdf", width=9, height=6, onefile=FALSE)
upset(upsetData,
      nsets = length(scoreType),    #展示打分类型个数
      nintersects = 50,             #展示基因集数目
      order.by = "freq",            #排序的方式(按照基因数目排序)
      show.numbers = "yes",         #ֵ柱状图上方是否显示数值
      number.angles = 20,           #字体角度
      point.size = 1.5,             #点的大小
      matrix.color="red",           #点的颜色
      line.size = 0.8,              #线条粗细
      mainbar.y.label = "Gene Intersections", 
      sets.x.label = "Set Size")
dev.off()

#获取交集基因,作为网络的核心基因
intersectGenes=Reduce(intersect, upsetList)
intersectGenes=as.data.frame(intersectGenes)
colnames(intersectGenes)="Gene"
write.csv(file="hubGenes.csv", intersectGenes, quote=F, row.names=F)

核心基因可视化

热图

#if (!requireNamespace("BiocManager", quietly = TRUE))
#    install.packages("BiocManager")
#BiocManager::install("limma")

#install.packages("pheatmap")
library(limma)
library(pheatmap)
setwd("")     

exp=read.table("exp6.csv", header=T, sep=",", check.names=F,row.names = 1)
dimnames=list(rownames(exp),colnames(exp))
data=matrix(as.numeric(as.matrix(exp)),nrow=nrow(exp),dimnames=dimnames)
data=avereps(data)

geneRT=read.csv("hubGenes.csv", header=T, sep=",", check.names=F)
data=data[as.vector(geneRT[,1]),]#核心基因的表达矩阵
annotation_col1 <- data.frame(
  database <- c(rep("GEO124226",8)),
  Type <- c(rep("control",4),rep("treatment",4))
)
rownames(annotation_col1) <- colnames(data)
colnames(annotation_col1) <- c("database","Type")
#绘制核心基因热图
pdf(file="hubGene.heatmap.pdf", width=7.5, height=3.5)
pheatmap(data, 
         annotation_col =annotation_col1, 
         color = colorRampPalette(c(rep("blue",2), "white", rep("red",2)))(50),
         cluster_cols =F,
         show_colnames = F,
         scale="row",
         fontsize = 8,
         fontsize_row=7,
         fontsize_col=8)
dev.off()

7.1 核心基因绘制ROC曲线

#install.packages("glmnet")
#install.packages("pROC")

library(glmnet)
library(pROC)

expFile="limma_expr_control+treat.csv"    
geneFile="cytohubba_0.9_10.csv"   #核心基因      
setwd("D:\\limma_expr\\WGCNA1")   

rt=read.table(expFile, header=T, sep=",", check.names=F, row.names=1)
geneRT=read.table(geneFile, header=T, sep=",", check.names=F)
geneRT <- data.frame(geneRT$Name)
#获取表达矩阵样品信息
y <- c(rep(0,15),rep(1,23))


#对核心基因进行循环,绘制ROC曲线
bioCol=rainbow(nrow(geneRT), s=0.9, v=0.9)    #定义图形颜色
aucText=c()
k=0
for(x in as.vector(geneRT[,1])){
	k=k+1
	绘制ROC曲线
	roc1=roc(y, as.numeric(rt[x,]))     #得到ROC曲线的参数
	if(k==1){
		pdf(file="ROC.genes.pdf", width=5, height=4.75)
		plot(roc1, print.auc=F, col=bioCol[k], legacy.axes=T, main="")
		aucText=c(aucText, paste0(x,", AUC=",sprintf("%.3f",roc1$auc[1])))
	}else{
		plot(roc1, print.auc=F, col=bioCol[k], legacy.axes=T, main="", add=TRUE)
		aucText=c(aucText, paste0(x,", AUC=",sprintf("%.3f",roc1$auc[1])))
	}
}
#绘制图例,得到ROC曲线下面积
legend("bottomright", aucText, lwd=2, bty="n", col=bioCol[1:(ncol(rt)-1)])
dev.off()

#构建逻辑模型
rt=rt[as.vector(geneRT[,1]),]      #提取核心基因表达量
rt=as.data.frame(t(rt))
logit=glm(y ~ ., family=binomial(link='logit'), data=rt)
pred=predict(logit, newx=rt)     #得到模型的打分

#绘制模型的ROC曲线
roc1=roc(y, as.numeric(pred))      #得到模型ROC曲线的参数
ci1=ci.auc(roc1, method="bootstrap")     #ROC曲线下面积的波动范围
ciVec=as.numeric(ci1)
pdf(file="ROC.model.pdf", width=5, height=4.75)
plot(roc1, print.auc=TRUE, col="red", legacy.axes=T, main="Model")
text(0.39, 0.43, paste0("95% CI: ",sprintf("%.03f",ciVec[1]),"-",sprintf("%.03f",ciVec[3])), col="red")
dev.off()

八、免疫细胞浸润分析与核心基因联合分析

准备矫正后的合并文件combat_expr.csv或limma_expr.csv
cytoscape评分结果文件hubGenes.csv,免疫细胞结果文件cibersort_result.txt和Diff.cell.txt

#if (!requireNamespace("BiocManager", quietly = TRUE))
#    install.packages("BiocManager")
#BiocManager::install("limma")

#install.packages("tidyverse")
#install.packages("ggplot2")
#install.packages("reshape2")
#install.packages("ggpubr")
install.packages("ggExtra")
#install.packages("corrplot")

library(limma)
library(reshape2)
library(tidyverse)
library(ggplot2)
library(ggpubr)
library(ggExtra)
library(corrplot)

corFilter=0.4                       #相关系数过滤文件
pvalueFilter=0.001                  #相关性检验pvalue过滤文件


exp=read.table("limma_expr.csv", header=T, sep=",", check.names=F,row.names = 1)
exp=as.matrix(exp)
dimnames=list(rownames(exp),colnames(exp))
data=matrix(as.numeric(as.matrix(exp)),nrow=nrow(exp),dimnames=dimnames)
data=avereps(data)

#提取核心基因的表达量
geneRT=read.csv("hubGenes.csv", header=T, sep=",", check.names=F)
data=data[as.vector(geneRT[,1]),]#行名基因名,列名样本名

#去除对照组样品
group <- data.frame(sample=character(38),group=character(38))
group$sample <- colnames(exp)
tissue <- c(rep(c("control","treat","control","treat","control","treat"),c(3,7,4,8,8,8)))
group$group <- tissue
data <- cbind(t(data),group)
data=data[data$group=="treat",]#行名为样本名,列名为基因名
data <- data[,-c(13,14)]
data <- as.matrix(data)


#读取核心免疫细胞的含量
immune=read.table("cibersort_result.txt", header=T, sep="\t", check.names=F, row.names=1)
cellRT=read.csv("Diff.cell.txt", header=F, sep="\t", check.names=F)
immune=immune[,as.vector(cellRT[,1])]#行名样本名,列名为免疫细胞名
#提取基因表达矩阵和免疫浸润结果文件中都包含的样本信息
sameSample=intersect(row.names(data), row.names(immune))
data=data[sameSample,,drop=F]
immune=immune[sameSample,,drop=F]#都只含实验组样品

#相关性分析
outTab=data.frame()
#对免疫细胞进行循环
for(cell in colnames(immune)){
  if(sd(immune[,cell])==0){next}
  #对核心基因进行循环
  for(gene in colnames(data)){
    x=as.numeric(data[,gene])
    y=as.numeric(immune[,cell])
    corT=cor.test(x,y,method="spearman")
    cor=corT$estimate
    pvalue=corT$p.value
    #对满足条件的免疫细胞进行可视化,绘制相关性图形
    if((abs(cor)>corFilter) & (pvalue<pvalueFilter)){
      df1=as.data.frame(cbind(x,y))
      p1=ggplot(df1, aes(x, y)) + 
        xlab(paste0(gene, " expression")) + ylab(cell)+
        geom_point() + geom_smooth(method="lm",formula = y ~ x) + theme_bw()+
        stat_cor(method = 'spearman', aes(x =x, y =y))
      #相关性图形
      pdf(file=paste0("cor.", gene, "_", cell, ".pdf"), width=5, height=4.6)
      print(p1)
      dev.off()
    }
  }
}

#相关性热图ͼ
data=cbind(data, immune)#行名为实验组样品名,列名为基因名和免疫细胞名
write.csv(data,"data")
a <- read.csv("data",header=T,row.names = 1)
pdf("corHeatmap.pdf", width=9, height=9)
corrplot(corr=cor(a),
         method = "color",          #图形的展示形式
         order = "original",        #免疫细胞的排序方式
         tl.col="black",            #字体颜色
         addCoef.col = "black",     #相关系数字体颜色
         number.cex = 0.8,          #相关系数字体大小
         col=colorRampPalette(c("blue", "white", "red"))(50),     #ͼ????ɫ??????
)
dev.off()

  • 27
    点赞
  • 50
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
差异表达分析是一种常用的方法,用于比较不同条件下基因表达水平的变化。GEOquery和limma是R语言中广泛使用的两个包,可用于处理和分析芯片数据差异表达。下面是使用GEOquery和limma进行差异表达分析的步骤: 1. 下载和导入芯片数据 使用GEOquery包中的getGEO函数下载芯片数据并导入到R中。例如,如果您要下载GSE12345数据集,可以使用以下代码: ``` library(GEOquery) gset <- getGEO("GSE12345") ``` 2. 数据质量控制 在进行差异表达分析之前,需要对数据进行质量控制。使用GEOquery包中的summary函数和plotPCA函数可以对芯片数据进行基本的质量控制。例如,可以使用以下代码绘制PCA图: ``` library(limma) library(GEOquery) gset <- getGEO("GSE12345") edata <- exprs(gset[[1]]) edata <- t(edata) edata <- na.omit(edata) fit <- prcomp(edata) plotPCA(fit) ``` 3. 数据预处理 对芯片数据进行归一化和标准化处理,以消除不同芯片之间的差异,并确保数据符合正态分布。常用的预处理方法包括RMA、GCRMA、MAS5等。使用limma包中的normalizeBetweenArrays函数可以对芯片数据进行预处理。例如,可以使用以下代码对芯片数据进行RMA预处理: ``` library(limma) library(GEOquery) gset <- getGEO("GSE12345") edata <- exprs(gset[[1]]) edata <- t(edata) edata <- na.omit(edata) edata <- backgroundCorrect.RMA(edata) edata <- normalize.quantiles.RMA(edata) edata <- log2(edata) ``` 4. 差异表达分析 使用limma包中的lmFit函数和eBayes函数可以进行差异表达分析。lmFit函数用于拟合线性模型,eBayes函数用于对差异表达结果进行统计显著性检验。例如,可以使用以下代码进行差异表达分析: ``` library(limma) library(GEOquery) gset <- getGEO("GSE12345") edata <- exprs(gset[[1]]) edata <- t(edata) edata <- na.omit(edata) edata <- backgroundCorrect.RMA(edata) edata <- normalize.quantiles.RMA(edata) edata <- log2(edata) factors <- c(0,0,0,1,1,1) design <- model.matrix(~factors) fit <- lmFit(edata, design) fit <- eBayes(fit) results <- topTable(fit, adjust="BH", sort.by="P", n=1000) ``` 上述代码中,factors表示芯片数据中不同样本的分组信息,design表示设计矩阵,fit表示拟合的线性模型,results表示差异表达结果。 5. 结果分析 根据差异表达结果,可以进行进一步的功能分析、通路分析等。常用的工具包括ClusterProfiler、GOstats、KEGGprofile等。例如,可以使用以下代码进行GO分析: ``` library(ClusterProfiler) gset <- getGEO("GSE12345") edata <- exprs(gset[[1]]) edata <- t(edata) edata <- na.omit(edata) edata <- backgroundCorrect.RMA(edata) edata <- normalize.quantiles.RMA(edata) edata <- log2(edata) factors <- c(0,0,0,1,1,1) design <- model.matrix(~factors) fit <- lmFit(edata, design) fit <- eBayes(fit) results <- topTable(fit, adjust="BH", sort.by="P", n=1000) genes <- rownames(results) geneList <- names(genes) geneList <- names(genes)[abs(results$logFC) > 1 & results$adj.P.Val < 0.05] ego <- enrichGO(geneList, OrgDb="org.Hs.eg.db", ont="BP") barplot(ego) ``` 上述代码中,geneList表示差异表达基因列表,ego表示进行GO分析所得到的富集结果。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值