V.PhyloMaker是一款应用于R的资源包,存储了数以万计的维管束植物的系统发生信息。V.PhyloMaker由贵州师范大学生命科学学院金毅教授等人开发,涵盖了74533种维管束植物的基本信息,为想要避免繁琐地查找基因/蛋白质数据的用户构建系统发生树提供了便利。
V.PhyloMaker的代码在github上以源代码的形式公开(jinyizju/V.PhyloMaker),本文接下来的代码部分中的注释,对代码中的函数bind.relative(bind.relative.R)进行了解析。
bind.relative <-
function (sp.list, tree = GBOTB.extended, nodes = nodes.info.1,
output.sp.list = TRUE)
{
nodesN <- nodes
treeX <- tree
# 将因子(factor)变成字符串(character)
sp.list[sapply(sp.list, is.factor)] <- lapply(sp.list[sapply(sp.list,
is.factor)], as.character)
# 物种(使用者提供的物种列表中的物种)去重,并输出
if (any(duplicated(sp.list$species))) {
warning("Duplicated species detected and removed.")
print(sp.list$species[duplicated(sp.list$species)])
}
sp.list <- sp.list[!duplicated(sp.list$species), ]
sp.list.original <- sp.list
# 接下来,使用了正则表达式(perl = TRUE意味着使用Perl规则)对科属种的名称进行了处理
# 将空格“ ”改成“_”
sp.list$species <- gsub(" ", "_", sp.list$species)
# 首字母大写
sp.list$species <- gsub("(^[[:alpha:]])", "\\U\\1", sp.list$species,
perl = TRUE)
sp.list$genus <- gsub("(^[[:alpha:]])", "\\U\\1", sp.list$genus,
perl = TRUE)
sp.list$family <- gsub("(^[[:alpha:]])", "\\U\\1", sp.list$family,
perl = TRUE)
sp.list$species.relative <- gsub(" ", "_", sp.list$species.relative)
# 使用正则表达式,将首字母大写(“^”匹配字符串开头,[[:alpha:]]匹配小写字母)
sp.list$species.relative <- gsub("(^[[:alpha:]])", "\\U\\1",
sp.list$species.relative, perl = TRUE)
sp.list$genus.relative <- gsub("(^[[:alpha:]])", "\\U\\1",
sp.list$genus.relative, perl = TRUE)
# 正则表达式处理完毕
oriN <- tree$node.label
# 重置tree的node.label部分
tree$node.label <- paste("N", 1:tree$Nnode, sep = "")
add <- sp.list[which(is.na(match(sp.list$species, tree$tip.label))),
]
if (dim(add)[1] == 0 & length(na.omit(match(sp.list$species,
tree$tip.label))) == 0)
stop("Incorrect format of species list.")
# 如果sp.list中的所有物种(species)都在树里面,则执行下面的操作
if (length(setdiff(sp.list$species, treeX$tip.label)) == 0 & length(na.omit(match(sp.list$species, tree$tip.label))) > 0)
{
print("All species in sp.list are present on tree.")
splis <- sp.list.original
# 把多余的物种(在树里面,但不在sp.list中的物种)使用drop.tip清理掉
treeX <- drop.tip(treeX, setdiff(treeX$tip.label, sp.list$species))
splis$status <- "prune"
phyloX <- list(phylo = treeX, species.list = splis)
返回修剪(去除掉多余的物种)后的树
return(phyloX)
stop()
}
# 如果sp.list中的所有物种(species)不都在树里面,那么检查sp.list中的亲缘物种(species.relative)
# 这里的f, x, h.sp, fG, xG, h.gen, h都是索引号(index,序号),而不是物种信息
f <- which(sp.list$species %in% tree$tip.label)
x <- which(sp.list$species.relative %in% tree$tip.label)
h.sp <- setdiff(x, f)
fG <- which(sp.list$genus %in% nodes[nodes$level == "G",
]$genus)
xG <- which(sp.list$genus.relative %in% nodes[nodes$level ==
"G", ]$genus)
h.gen <- setdiff(xG, fG)
h.gen <- setdiff(h.gen, union(x, f))
h <- union(h.sp, h.gen)
# 由序号取得物种信息
# sel.sp指的是species.relative在GBOTB.extended,但species本身不在GBOTB.extended中的那些物种条目
sel.sp <- sp.list[h.sp, ]
# sel.gen指的是genus.relative在GBOTB.extended,但genus本身不在GBOTB.extended中的那些物种条目,并且也不包含已经在sel.sp中的那些条目
sel.gen <- sp.list[h.gen, ]
sel.gen <- sel.gen[!duplicated(sel.gen$genus), ]
# 先判断是否为空,避免下面的for循环报错
if (dim(sel.gen)[1] > 0) {
for (i in 1:dim(sel.gen)[1]) {
n <- match(sel.gen$genus.relative[i], nodes$genus)
x <- length(tree$tip.label) + which(tree$node.label ==
nodes$bn[n])
m <- data.frame(level = "G", family = nodes$family[n],
genus = sel.gen$genus[i], rn = nodes$bn[n], rn.bl = nodes$rn.bl[n],
bn = nodes$bn[n], bn.bl = nodes$bn.bl[n], gen.n = 1,
sp.n = 1, taxa = sel.gen$species[i], stringsAsFactors = FALSE)
nodesN <- rbind(nodesN, m)
tree <- at.node(tree, x, sel.gen$species[i])
}
}
# 先判断是否为空,避免下面的for循环报错
if (dim(sel.sp)[1] > 0) {
for (i in 1:dim(sel.sp)[1]) {
n <- which(tree$edge[, 2] == match(sel.sp$species.relative[i],
tree$tip.label))
tree <- at.node(tree, tree$edge[n, 1], sel.sp$species[i])
}
}
tree$edge.length <- as.numeric(tree$edge.length)
tree$node.label <- oriN
status <- rep("", dim(sp.list)[1])
status[h] <- "add to relative"
sp.list.original$status.relative <- status
if (output.sp.list == FALSE)
sp.list <- NULL
phylo <- list(phylo = tree, species.list = sp.list.original,
nodes.info = nodesN)
phylo[sapply(phylo, is.null)] <- NULL
# 输出未剪枝的树
return(phylo)
}