前言
主坐标分析(principal co-ordinates analysis,PCoA)是一种非约束性的数据降维分析方法,可用来研究样本群落组成的相似性或相异性。PCoA以样本距离为整体考虑,应用非常广泛。这期,我们用R语言ggplot2来实现PCoA可视化。
引用
https://zhuanlan.zhihu.com/p/389494756 (在线作图丨数据降维方法②——主坐标分析PCoA)
https://www.bilibili.com/video/BV1mU4y1m74B (R语言保姆级教程/PCoA/参数自定义)
还有其他,时间略久,找不到具体网址了…
第一步:加载安装包
library(ade4)
library(ggplot2)
library(RColorBrewer)
library(vegan)
library(plyr)
第二步:导入数据
data('iris') # R语言自带数据集
data <- iris
head(data)
第三步:PCoA分析
data_new <- data[,-5] # 先去掉最后一列的species
data_dist <- vegdist(data_new,method='bray') # 计算距离
data_pcoa <- cmdscale(data_dist,k=(nrow(data_new)-1),eig=TRUE) # PCoA
看到warning message,所以要矫正负特征根。
data_pcoa <- cmdscale(data_dist,k=(nrow(data_new)-1),eig=TRUE,add=TRUE) # 矫正负特征根
data_pcoa_eig <- data_pcoa$eig # 各PCoA轴特征值
data_pcoa_exp <- data_pcoa_eig/sum(data_pcoa_eig) # 各PCoA轴解释量
pcoa1 <- paste(round(100*data_pcoa_exp[1],2),'%') # 前2个轴的解释量
pcoa2 <- paste(round(100*data_pcoa_exp[2],2),'%') # PCoA1:40.38%; PCoA2:6.9%
sample_site <- data.frame(data_pcoa$points)[1:2]
sample_site$level<-factor(data[,5],levels = c('setosa','versicolor','virginica'))
names(sample_site)[1:3] <- c('PCoA1', 'PCoA2','level')
head(sample_site)
summary(sample_site)
在计算出前2个轴的解释量之后,提取data_pcoa$points前2个轴的值,作为坐标点,为画PCoA图做准备。通过summary可知,目前的数据集sample_site共有150条数据,其中level共有3种水平,分别50条数据。
第四步:PCoA作图
有时候作图,碰到的分组情况是Group1、Group2、Group3这种按顺序排好的组别,但是图画出来后的图例顺序却是乱的(如Group1、Group3、Group2这种),这种时候就需要先给定好顺序。
level_order <- c("setosa","versicolor","virginica")
level_order <- factor(1:length(level_order),labels = level_order)
sample_site$level <- factor(sample_site$level,levels = levels(level_order))
PCoA作图需要准备的步骤:
find_hull <- function(sample_site) sample_site[chull(sample_site$PCoA1, sample_site$PCoA2),]
hulls <- ddply(sample_site, "level", find_hull)
hulls[,1]
PCoA1_mean <- tapply(sample_site$PCoA1,sample_site$level, mean)
PCoA2_mean <- tapply(sample_site$PCoA2,sample_site$level, mean)
mean_point <- rbind(PCoA1_mean,PCoA2_mean)
mean_point <- data.frame(mean_point)
t1 <- t(mean_point)
t2 <- as.data.frame(t1)
t2
pcoa_plot <- ggplot(sample_site, aes(PCoA1, PCoA2, color=level,shape=level)) +
theme_classic()+ #去掉背景框
geom_vline(xintercept = 0, color = 'gray', size = 0.4) +
geom_hline(yintercept = 0, color = 'gray', size = 0.4) +
geom_polygon(data = hulls,alpha = 0.2,aes(fill=factor(level),color=level),size=0.2,show.legend = FALSE) +
geom_point(size = 4) + #设置点的透明度、大小
geom_point(aes(t2[1,1], t2[1,2]),size = 0.5, color='royalblue4') +
geom_point(aes(t2[2,1], t2[2,2]),size = 0.5, color='brown2') +
geom_point(aes(t2[3,1], t2[3,2]),size = 0.5, color='seagreen4') +
geom_segment(aes(x=PCoA1,xend=rep(t2[,1],each=50),y=PCoA2,yend=rep(t2[,2],each=50)),size=0.2) +
scale_color_manual(values = c('royalblue4','brown2','seagreen4')) +
scale_fill_manual(values = c('royalblue4','brown2','seagreen4')) +
scale_shape_manual(values = c(20,20,20)) +
scale_x_continuous(limits = c(-0.4,0.3),breaks=round(seq(-0.4,0.3,0.1),2)) +
scale_y_continuous(limits = c(-0.25,0.25),breaks=round(seq(-0.25,0.25,0.1),2)) +
theme(panel.grid = element_line(color = 'black', linetype = 1, size = 0.1),
panel.background = element_rect(color = 'black', fill = 'transparent'),
legend.title=element_blank()
)+labs(x = paste('PCoA1[40.38%]'), y = paste('PCoA2[6.90%]'))
第五步:保存图片
png(filename = 'pcoa_plot.png',width = 2500, height = 2000,res = 300)
pcoa_plot
dev.off()
到此,PCoA图已经画好。没有加置信椭圆,直接以最外围的点连接起来作为最外面边界,并将每一组的中心点位置求出来,使得每一个点都与中心点连接,形成了该PCoA图。
全部代码:
# PCoA 主坐标分析
# PCoA与PCA都是降低数据维度的方法
# 差异在于PCA是基于原始矩阵,而PCoA是基于通过原始矩阵计算出的距离矩阵
## 第一步:加载安装包
library(ade4)
library(ggplot2)
library(RColorBrewer)
library(vegan)
library(plyr)
## 第二步:导入数据
data('iris') # R语言自带数据集
data <- iris
head(data)
## 第三步:PCoA分析
data_new <- data[,-5]
data_dist <- vegdist(data_new,method='bray') # 计算距离
data_pcoa <- cmdscale(data_dist,k=(nrow(data_new)-1),eig=TRUE) # PCoA
data_pcoa <- cmdscale(data_dist,k=(nrow(data_new)-1),eig=TRUE,add=TRUE) # 矫正负特征根
data_pcoa_eig <- data_pcoa$eig # 各PCoA轴特征值
data_pcoa_exp <- data_pcoa_eig/sum(data_pcoa_eig) # 各PCoA轴解释量
pcoa1 <- paste(round(100*data_pcoa_exp[1],2),'%')
pcoa2 <- paste(round(100*data_pcoa_exp[2],2),'%')
sample_site <- data.frame(data_pcoa$points)[1:2]
sample_site$level<-factor(data[,5],levels = c('setosa','versicolor','virginica'))
names(sample_site)[1:3] <- c('PCoA1', 'PCoA2','level')
head(sample_site)
summary(sample_site)
## 第四步:PCoA作图
level_order <- c("setosa","versicolor","virginica") # 确定level顺序
level_order <- factor(1:length(level_order),labels = level_order)
sample_site$level <- factor(sample_site$level,levels = levels(level_order))
find_hull <- function(sample_site) sample_site[chull(sample_site$PCoA1, sample_site$PCoA2),]
hulls <- ddply(sample_site, "level", find_hull)
hulls[,1]
PCoA1_mean <- tapply(sample_site$PCoA1,sample_site$level, mean)
PCoA2_mean <- tapply(sample_site$PCoA2,sample_site$level, mean)
mean_point <- rbind(PCoA1_mean,PCoA2_mean)
mean_point <- data.frame(mean_point)
t1 <- t(mean_point)
t2 <- as.data.frame(t1)
t2
pcoa_plot <- ggplot(sample_site, aes(PCoA1, PCoA2, color=level,shape=level)) +
theme_classic()+ #去掉背景框
geom_vline(xintercept = 0, color = 'gray', size = 0.4) +
geom_hline(yintercept = 0, color = 'gray', size = 0.4) +
geom_polygon(data = hulls,alpha = 0.2,aes(fill=factor(level),color=level),size=0.2,show.legend = FALSE) +
geom_point(size = 4) + #设置改点的透明度、大小
geom_point(aes(t2[1,1], t2[1,2]),size = 0.5, color='royalblue4') +
geom_point(aes(t2[2,1], t2[2,2]),size = 0.5, color='brown2') +
geom_point(aes(t2[3,1], t2[3,2]),size = 0.5, color='seagreen4') +
geom_segment(aes(x=PCoA1,xend=rep(t2[,1],each=50),y=PCoA2,yend=rep(t2[,2],each=50)),size=0.2) +
scale_color_manual(values = c('royalblue4','brown2','seagreen4')) +
scale_fill_manual(values = c('royalblue4','brown2','seagreen4')) +
scale_shape_manual(values = c(20,20,20)) +
scale_x_continuous(limits = c(-0.4,0.3),breaks=round(seq(-0.4,0.3,0.1),2)) +
scale_y_continuous(limits = c(-0.25,0.25),breaks=round(seq(-0.25,0.25,0.1),2)) +
theme(panel.grid = element_line(color = 'black', linetype = 1, size = 0.1),
panel.background = element_rect(color = 'black', fill = 'transparent'),
legend.title=element_blank()
)+labs(x = paste('PCoA1[40.38%]'), y = paste('PCoA2[6.90%]'))
pcoa_plot
png(filename = 'pcoa_plot.png',width = 2500, height = 2000,res = 300)
pcoa_plot
dev.off()