1、GSEA
rm(list = ls())
library(clusterProfiler)
library(org.Hs.eg.db)
input <- read.table("data/GSEA_input.txt",sep="\t",header=T,check.names=F)
head(input)
# 输入包含两列,基因名和logFC值
# ENTREZID logFC
# 1 4312 4.572613
# 2 8318 4.514594
# 3 10874 4.418218
# 4 55143 4.144075
# 5 55388 3.876258
# 6 991 3.677857
dim(input)
# 按照logFC值对基因进行排序
## 1: 提取logFC值,并储存在一个向量中
geneList = input[,2]
## 2: 对geneList进行命名
names(geneList) = as.character(input[,1])
head(geneList)
## 基因名:ENTREZID
# 4312 8318 10874 55143 55388 991
## 基因对应的logFC值
# 4.572613 4.514594 4.418218 4.144075 3.876258 3.677857
## 3: 根据logFC值降序排列
geneList = sort(geneList, decreasing = TRUE)
########## GO的GSEA富集分析:gseGO ##########
gsego <- gseGO(geneList = geneList,#排序后的基因列表,一般根据logFC进行排序
OrgDb = org.Hs.eg.db,
ont = "CC",#可选择bp.MF,CC,ALL
nPerm = 1000,#置换检验的次数,默认1000,保持默认即可
minGSSize = 100,#最小基因集的基因数
maxGSSize = 500,#最大基因集的基因数
pvalueCutoff = 0.05,#p值的阈值
verbose = FALSE)#是否输出提示信息,默认为false
head(gsego)
# 保存GO的GSEA分析结果到文件
write.table(gsego,file="data/GSEA_GO_result.txt",sep="\t",
quote=F,row.names = F)
########## KEGG的GSEA富集分析:gseKEGG ##########
gsekk <- gseKEGG(geneList = geneList,
organism = 'hsa',
nPerm = 1000,
minGSSize = 120,
pvalueCutoff = 0.05,
verbose = FALSE)
head(gsekk)
# 保存KEGG的GSEA分析结果到文件
write.table(gsekk,file="data/GSEA_KEGG_result.txt",sep="\t",
quote=F,row.names = F)
########## MSigDb的GSEA富集分析:GSEA ##########
# MSigDb数据集下载网址
# https://www.gsea-msigdb.org/gsea/downloads.jsp
# gmt文件储存的文件夹名
msigdb_GMTs <- "msigdb_v7.0_GMTs"
# 对应输入基因名,ENTREZID对应Entrez Gene ID
# msigdb.v7.0.symbols.gmt对应Gene Symbol名
# 指定用于GSEA富集分析的gmt文件
msigdb <- "msigdb.v7.0.entrez.gmt"
# 读取上面指定的gmt文件
all_msigdb <- read.gmt(file.path(msigdb_GMTs,msigdb))
gsegmt <- GSEA(geneList, TERM2GENE=all_msigdb, verbose=FALSE)#TERM2GENE,数据集条目名称TERM与基因名的对应关系,一般是两列的数据框格式
head(gsegmt)
# 保存KEGG的GSEA分析结果到文件
write.table(gsegmt,file="data/GSEA_MSigDb_result.txt",sep="\t",
quote=F,row.names = F)
# 将以上所有结果保存为R二进制格式,方便快速加载
save(geneList, gsego, gsekk, gsegmt, file = "data/gsea_result.Rda")
#可视化
# 清空变量
remove(list = ls())
# 加载所需R包
library(clusterProfiler)
library(ggplot2)
# 加载GSEA结果,以便用于后续可视化
load(file = "data/gsea_result.Rda")
########## 气泡图:dotplot ##########
tiff(file="figures/GSEA_GO_dotplot.tiff",
width = 35,height = 22,units ="cm",
compression="lzw",bg="white",res=300)
dotplot(gsego, showCategory=30) + ggtitle("dotplot for GSEA")
dev.off()
dotplot(gsekk, showCategory=30)
dotplot(gsegmt, showCategory=30)
########## 网络图:cnetplot ##########
tiff(file="figures/GSEA_KEGG_cnetplot.tiff",
width = 35,height = 22,units ="cm",
compression="lzw",bg="white",res=300)
cnetplot(gsekk, categorySize="pvalue", foldChange=geneList)
dev.off()
cnetplot(gsekk, foldChange=geneList)
cnetplot(gsekk, foldChange=geneList, circular = TRUE, colorEdge = TRUE)
########## 热图:heatplot ##########
tiff(file="figures/GSEA_GO_heatplot.tiff",
width = 35,height = 22,units ="cm",
compression="lzw",bg="white",res=300)
heatplot(gsego, foldChange=geneList)
dev.off()
########## 富集图:emapplot ##########
tiff(file="figures/GSEA_GO_emapplot.tiff",
width = 35,height = 22,units ="cm",
compression="lzw",bg="white",res=300)
emapplot(gsego, pie_scale=1.5,layout="kk") #enrich map,这种图很少用了
dev.off()
########## 高级venn图:upsetplot ##########
library(ggupset)
library(enrichplot)
tiff(file="figures/GSEA_KEGG_upsetplot.tiff",
width = 35,height = 22,units ="cm",
compression="lzw",bg="white",res=300)
upsetplot(gsekk)
dev.off()
########## GSEA图:gseaplot、gseaplot2、gsearank ##########
## GSEA图:gseaplot
gseaplot(gsekk, geneSetID = 1, by = "runningScore", title = gsekk$Description[1])#runningScore光展示下面一个图
gseaplot(gsekk, geneSetID = 1, by = "preranked", title = gsekk$Description[1])#preranked光展示上面一个图
tiff(file="figures/GSEA_KEGG_gseaplot.tiff",
width = 35,height = 22,units ="cm",
compression="lzw",bg="white",res=300)
gseaplot(gsekk, geneSetID = 1, title = gsekk$Description[1])#geneSetID的1和gsekk$Description[1]的1是对应的
dev.off()
## GSEA图:gseaplot2
gseaplot2(gsekk, geneSetID = 1, title = gsekk$Description[1])
## 在一个gsea图上绘制多个基因集
gseaplot2(gsekk, geneSetID = 1:3)#绘制三个基因集
gseaplot2(gsekk, geneSetID = 1:3, subplots = 1)#绘制第一部分
gseaplot2(gsekk, geneSetID = 1:3, subplots = 1:2)#绘制两部分
tiff(file="figures/GSEA_KEGG_gseaplot2.tiff",
width = 35,height = 22,units ="cm",
compression="lzw",bg="white",res=300)
gseaplot2(gsekk, geneSetID = 1:3, subplots = 1:3,#geneSetID,gsekk中富集到的基因集编号,1:3表示前三个
pvalue_table = TRUE,
color = c("#E495A5", "#86B875", "#7DB0DD"), #指定颜色
ES_geom = "dot")#line 连线图;dot虚线图
dev.off()
## GSEA图:gsearank 不常用图
gsearank(gsekk, 1, title = gsekk[1, "Description"])
pp <- lapply(1:3, function(i) {
anno <- gsekk[i, c("NES", "pvalue", "p.adjust")]
lab <- paste0(names(anno), "=", round(anno, 3), collapse="\n")
gsearank(gsekk, i, gsekk[i, 2]) + xlab(NULL) +ylab(NULL) +
annotate("text", 0, gsekk[i, "enrichmentScore"] * .9, label = lab, hjust=0, vjust=0)
})
tiff(file="figures/GSEA_KEGG_gsearank.tiff",
width = 35,height = 40,units ="cm",
compression="lzw",bg="white",res=300)
plot_grid(plotlist=pp, ncol=1)
dev.off()
########## 山脊图:ridgeplot ##########
tiff(file="figures/GSEA_KEGG_ridgeplot.tiff",
width = 35,height = 22,units ="cm",
compression="lzw",bg="white",res=300)
ridgeplot(gsekk)
dev.off()
tiff(file="figures/GSEA_GO_ridgeplot.tiff",
width = 35,height = 22,units ="cm",
compression="lzw",bg="white",res=300)
ridgeplot(gsego)
dev.off()
2、GSVA
# 清空变量
remove(list = ls())
# 读取标准化后的表达矩阵
input <- read.table("data/GSE1297.exp.txt", sep="\t",
header=T, row.names = 1, check.names=F)
exp <- as.matrix(input)
# 加载所需R包
if(!"XML"%in%installed.packages())
install.packages("XML", type = "binary")
library(GSEABase)
library(GSVA)
# 设置gmt文件路径,这次我们用MsigDb数据集
msigdb_GMTs <- "msigdb_v7.0_GMTs"
msigdb <- "msigdb.v7.0.entrez.gmt"
# 获取gmt文件内容
geneset <- getGmt(file.path(msigdb_GMTs, msigdb))
# 进行GSVA分析
es.max <- gsva(exp, geneset,
mx.diff=FALSE, verbose=FALSE,
parallel.sz=1)#线程数
# 查看GSVA结果的前几列
head(es.max[1:10,1:2])
# 写出GSVA结果到文件
write.table(es.max, file="data/GSVA_MsigDb_result.txt",sep="\t",
quote=F,row.names = T)
# 将GSVA结果保存为R二进制格式
save(es.max, file = "data/GSVA_MsigDb_result.Rda")
#可视化
# 清空变量
remove(list = ls())
# 加载GSVA结果
load(file = "data/GSVA_MsigDb_result.Rda")
dim(es.max)
# 分组信息
group_list <- c(rep("untrt", 9), rep("trt",7))#untrt样本排在前,trt样本排在后
table(group_list)
# group_list
# trt untrt
# 7 9
design <- model.matrix(~0+factor(group_list))#设计矩阵,也就是分组矩阵
colnames(design) <- levels(factor(group_list))#设计矩阵的列名,Levels: trt untrt
rownames(design) <- colnames(es.max)#设计矩阵的行名
design
library(limma)#1.lmFit 2.eBayes 3.topTable 三大步骤
contrast.matrix<-makeContrasts("trt-untrt",#比较矩阵,trt和untrt相比
levels = design)#设计矩阵
contrast.matrix
# 拟合limma线性模型
fit <- lmFit(es.max,design)#线性拟合
fit2 <- contrasts.fit(fit, contrast.matrix)
fit2 <- eBayes(fit2)#贝叶斯
# 查看是否差异显著
res <- decideTests(fit2, p.value=0.05)
summary(res)
# trt-untrt
# Down 51
# NotSig 22502
# Up 28
# 提取差异分析结果
tmp <- topTable(fit2)
# 去除包含NA的行
DEG <- na.omit(tmp)
head(DEG[1:4,])
# 将差异分析结果保存到文件
write.table(DEG, file="data/GSVA_DEG_limma_result.txt",sep="\t",
quote=F,row.names = T)
# 将差异分析结果保存为R二进制格式
save(DEG, group_list, file = "data/GSVA_DEG_limma.Rda")
# 绘制差异分析热图
load(file = "data/GSVA_DEG_limma.Rda")
load(file = "data/GSVA_MsigDb_result.Rda")
DEG_sig <- DEG[DEG$P.Value<0.01 & abs(DEG$logFC) > 0.3,]
dat <- es.max[match(rownames(DEG_sig),rownames(es.max)),]
annotation_col <- data.frame(group_list)
rownames(annotation_col) <- colnames(dat)
pheatmap::pheatmap(dat,
width = 20,
height = 11,
annotation_col = annotation_col,
show_colnames = F,
filename = 'figures/GSVA_heatmap.png')
来源:傻傻分不清!GSEA & GSVA有啥差别?史上最全教程来了! - 知乎 (zhihu.com)https://zhuanlan.zhihu.com/p/506912398