生信碱移
火山图+PPI互作
火山图是差异分析的常见可视化方式,但它携带的信息实在是太少了,甚至没有放进论文组图的必要。在这里,小编考虑到火山图的散点比较多,估摸着能把蛋白互作 (PPI) 信息也一起放进去。
一、数据准备与R包引用
1.R 包引用:
library(ggplot2)
library(ggsci)
library(tidyverse)
library(tidydr)
2.输入数据:
# 差异表格数据读入
Diff <- read.csv(file = "Diff.csv", row.names = 1)
# ppi蛋白交互数据
ppiLink <- read.table("9606.protein.links.v12.0.txt", header = T, sep = " ")
ppiAnn_fristP <- read.table("9606.protein.info.v12.0.txt", header = F, sep = "\t")[, c(1,2)]
colnames(ppiAnn_fristP) <- c("protein1", "geneName1")
ppiAnn_secondP <- read.table("9606.protein.info.v12.0.txt", header = F, sep = "\t")[, c(1,2)]
colnames(ppiAnn_secondP) <- c("protein2", "geneName2")
ppi_all <- merge(ppiLink, ppiAnn_fristP, by = "protein1") %>% merge(ppiAnn_secondP, by = "protein2")
ppi_all <- ppi_all[,c(4, 5, 3)]
head(ppi_all)
rm(ppiLink, ppiAnn_fristP, ppiAnn_secondP)
输入文件中9606开头的文件是来自PPI数据库的蛋白互作信息和注释,在示例文件中同样提供,铁子们可以直接copy使用。另外,示例使用的 Diff.csv 是 DESeq2差异分析的输出表格,其它差异分析方法的表格也基本一致,只需匹配好列名(表头如下):
二、可视化
1.显著差异基因的阈值设置:
#火山图差异标记颜色标准设置
adjPval <- 0.05 # 矫正p值
aflogFC <- 1 # logFC
Significant <- ifelse((Diff$padj<adjPval & abs(Diff$log2FoldChange)>aflogFC), ifelse(Diff$log2FoldChange>aflogFC,"Up","Down"), "Not")
2.数据提取与可视化(见注释):
# 提取在Diff中出现的基因
ppi_line <- ppi_all[which(ppi_all$geneName1%in%rownames(Diff[!Significant=="Not",]) & ppi_all$geneName2%in%rownames(Diff[!Significant=="Not",])), ]
# 循环提取交互信息
line_res <- sapply(as.list(1:nrow(ppi_line)), function(x, Diff, ppi_line){
index1 <- which(rownames(Diff)==ppi_line$geneName1[x])
index2 <- which(rownames(Diff)==ppi_line$geneName2[x])
return(c(Diff$log2FoldChange[index1], -log10(Diff$padj)[index1], Diff$log2FoldChange[index2], -log10(Diff$padj)[index2]))
}, Diff=Diff[!Significant=="Not",], ppi_line=ppi_line)
# 计算DC值
ppi_all <- ppi_all[ppi_all$geneName1 %in% rownames(Diff)[!Significant=="Not"],]
DC <- as.data.frame(table(ppi_all$geneName1))
rownames(DC) <- DC$Var1
DCnum <- ifelse(rownames(Diff) %in% rownames(DC), 1, 0)
DCnum[which(rownames(Diff) %in% rownames(DC))] <- (DC[rownames(Diff)[which(rownames(Diff) %in% rownames(DC))], ])[,2]
Diff$DCscore <- DCnum
Diff$DCscore <- (Diff$DCscore/max(Diff$DCscore))*3
#write.csv(Diff, "Diff_result.csv", row.names = T) # 将每个显著差异基因的dc值储存
# 根据DC值标记名称
for_label <- Diff[!Significant=="Not",]
for_label <- for_label[abs(for_label$log2FoldChange) > 2 & for_label$padj < 1e-64,] # 先根据logFC和padj进一步筛选
for_label <- for_label[order(for_label$DCscore, decreasing = T)[1:20],] # 剩余基因中前20个dc最大的基因
for_label$hubGene <- rownames(for_label)
Diff$hubGene <- rownames(Diff)
# 进行绘图
rownames(line_res) <- c("x", "y", "xend", "yend")
line_res <- t(line_res)%>%as.data.frame()
line_res$combined_score <- (ppi_line$combined_score/max(ppi_line$combined_score))*0.5
#开始绘制
p <- ggplot(Diff, aes(log2FoldChange, -log10(padj)))
p <- p+geom_segment(data=line_res, aes(x=x, y=y, xend=xend, yend=yend), color="yellow3", size=0.3, alpha=0.1)
p <- p+
geom_point(aes(col=Significant, size=DCscore))+
scale_color_manual(values=c(ggsci::pal_nejm()(2)[2], "#838B8B", ggsci::pal_nejm()(2)[1]))+
labs(title = " ")+
theme(plot.title = element_text(size = 16, hjust = 0.5, face = "bold"))+
geom_hline(aes(yintercept=-log10(adjPval)), colour="black", linetype="twodash",size=0.5)+
geom_vline(aes(xintercept=aflogFC), colour="black", linetype="twodash",size=0.5)+
geom_vline(aes(xintercept=-aflogFC), colour="black", linetype="twodash",size=0.5)+
theme_dr(xlength = 0.1, ylength = 0.1) + theme(panel.grid=element_blank())+
scale_size(range = c(0.5, 5))+
ggrepel::geom_text_repel(
aes(label = hubGene),
data = for_label,
color="black",
#label.size =0.01,
#max.overlaps = Inf,
size = 2.5,
force = 10
)
p
#ggsave("Diff_volcanoPlot.pdf", width = 6, height = 5) # 保存文件时需要去除井号键
感兴趣的铁子可以试试