如何获取mostly单拷贝和strictly单拷贝基因

最新推荐文章于 2024-09-26 08:05:06 发布

让学习成为一种生活方式

最新推荐文章于 2024-09-26 08:05:06 发布

阅读量462

点赞数 8

分类专栏：基因组生物信息学系统进化建树文章标签：单拷贝基因脚本 github

本文链接：https://blog.csdn.net/weixin_44874487/article/details/141431912

版权

生物信息学同时被 3 个专栏收录

75 篇文章 8 订阅

订阅专栏

基因组

57 篇文章 5 订阅

订阅专栏

系统进化建树

10 篇文章 1 订阅

订阅专栏

参考这篇文章的有意思的分析脚本~

如何严格定义单拷贝基因的类型，值得思考。

Insights into angiosperm evolution, floral development and chemical biosynthesis from the Aristolochia fimbriata genome

https://github.com/yihenghu/Aristolochia_fimbriata_genome_analysis    #脚本github地址

orthoMCL分析

get_coalescent_codon.py 从单基因第一和第二密码子位置、第三密码子位置的拼接比对中估计的基于共祖模型的系统树。
get_concatenated_codon.py 从所有基因的第一和第二密码子位置、第三密码子位置的拼接比对中估计的基于拼接模型的系统树。

01 trictly_single_copy

get_strictly_single_copy_orthogroup.py 这个脚本用于从OrthoMCL中选择严格的单拷贝直系同源组。

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import re
import collections

# read orthogroup of orthoMCL or orthoFinder
def readGroups(groups):
    orthogroups = {}
    with open(groups, "r") as OGs:
        for og in OGs:
            if not og.startswith("\t"):
                orthogroups0 = re.split(r'\s*', re.sub(',', ' ', og.rstrip()))
                orthogroups[orthogroups0[0]] = orthogroups0[1:]
    return orthogroups  # {OG1:[geneA1,geneB1,geneC1],OG2:[geneA2,geneB2]}

# stat orthogroup
def statGroups(orthogroups):
    orthogroups_stat = {}
    for og_id in orthogroups.keys():
        species = re.findall(r"(\w+)\|", "\t".join(orthogroups[og_id]))
        species_copy = dict(collections.Counter(species))
        orthogroups_stat[og_id] = species_copy
    return orthogroups_stat

# multiple of list
def mult(list0):
    result = 1
    for i in list0:
        result *= i
    return result


def main():
    orthogroups = readGroups(sys.argv[1])
    orthogroups_stat = statGroups(orthogroups)

    # select "Pso==2，Pni==2", and other species are single copy
    for og, stats in orthogroups_stat.items():
        if len(stats) == 22:
            if stats["Pso"] <= 2 and stats["Pni"] <= 2:
                sp_num = {}
                for sp, number in stats.items():
                    if sp == "Pso" or sp == "Pni":
                        pass
                    else:
                        if number == 1:
                            sp_num[sp] = number
                if sum(sp_num.values()) == 20 and mult(sp_num.values())  == 1:
                    print og


if __name__ == "__main__":
    main()

02 mostly_single_copy

get_mostly_single_copy_orthogroup.py 这个脚本用于从OrthoMCL中选择主要是单拷贝的直系同源组。

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import re
import collections

# read orthogroup of orthoMCL
def readGroups(groups):
    orthogroups = {}
    with open(groups, "r") as OGs:
        for og in OGs:
            if not og.startswith("\t"):
                orthogroups0 = re.split(r'\s*', re.sub(',', ' ', og.rstrip()))
                orthogroups[orthogroups0[0]] = orthogroups0[1:]
    return orthogroups  # {OG1:[geneA1,geneB1,geneC1],OG2:[geneA2,geneB2]}

# stat orthogroup
def statGroups(orthogroups):
    orthogroups_stat = {}
    for og_id in orthogroups.keys():
        species = re.findall(r"(\w+)\|", "\t".join(orthogroups[og_id]))
        species_copy = dict(collections.Counter(species))
        orthogroups_stat[og_id] = species_copy
    return orthogroups_stat

# multiple of list
def mult(list0):
    result = 1
    for i in list0:
        result *= i
    return result

# stat monocots, eudicots, magnoliids, basal angiosperm per OGs cotain number of genes
division = {
    "Smo": "Out",
    "Pab": "Out",
    "Gbi": "Out",
    "Atr": "Out",
    "Nad": "Out",
    "Spo": "Mon",
    "Peq": "Mon",
    "Mac": "Mon",
    "Sbi": "Mon",
    "Osa": "Mon",
    "Afi": "Mog",
    "Cmi": "Mog",
    "Lir": "Mog",
    "Pni": "Mog",
    "Pam": "Mog",
    "Aco": "Eud",
    "Pso": "Eud",
    "Nnu": "Eud",
    "Vvi": "Eud",
    "Sly": "Eud",
    "Ptr": "Eud",
    "Ath": "Eud"}


def main():
    orthogroups = readGroups(sys.argv[1])
    orthogroups_stat = statGroups(orthogroups)
    new_orthogroups_stat = {}
    for og,stats in orthogroups_stat.items():
        new_stats = {}
        for sp in stats:
            new_stats[(sp, division[sp])] = stats[sp]
        new_orthogroups_stat[og] = new_stats
    for og, stats in new_orthogroups_stat.items():
        out, mon, mog, eud = 0, 0, 0, 0
        out0, mon0, mog0, eud0 = 0, 0, 0, 0
        for sp_di in stats:
            if sp_di[1] == "Out":
                if stats[sp_di] == 1:
                    out += 1
                else:
                    out0 += 1
            elif sp_di[1] == "Mon":
                if stats[sp_di] == 1:
                    mon += 1
                else:
                    mon0 += 1
            elif sp_di[1] == "Mog":
                if sp_di[0] == "Pni":
                    if 0 < stats[sp_di] <= 2:
                        mog += 1
                    else:
                        mog0 += 1
                else:
                    if stats[sp_di] == 1:
                        mog += 1
                    else:
                        mog0 += 1
            elif sp_di[1] == "Eud":
                if sp_di[0] == "Pso":
                    if 0 < stats[sp_di] <= 2:
                        eud += 1
                    else:
                        eud0 += 1
                else:
                    if stats[sp_di] == 1:
                        eud += 1
                    else:
                        eud0 += 1
        if out0 == 0 and mon0 == 0 and mog0 == 0 and eud0 == 0:
            if out >= 3 and mon >= 3 and mog >= 3 and eud >= 4:
                print og

if __name__ == "__main__":
    main()

get_supermatrix_from_MSA_for_low_copy.py 这个脚本用于从OrthoMCL中获取基于拼接比对的超级矩阵（supermatrix）。