如何获取mostly单拷贝和strictly单拷贝基因

参考这篇文章的有意思的分析脚本~

如何严格定义单拷贝基因的类型,值得思考。

Insights into angiosperm evolution, floral development and chemical biosynthesis from the Aristolochia fimbriata genome

https://github.com/yihenghu/Aristolochia_fimbriata_genome_analysis    #脚本github地址

orthoMCL分析

get_coalescent_codon.py 从单基因第一和第二密码子位置、第三密码子位置的拼接比对中估计的基于共祖模型的系统树。
get_concatenated_codon.py 从所有基因的第一和第二密码子位置、第三密码子位置的拼接比对中估计的基于拼接模型的系统树。

01 trictly_single_copy

get_strictly_single_copy_orthogroup.py 这个脚本用于从OrthoMCL中选择严格的单拷贝直系同源组。

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import re
import collections

# read orthogroup of orthoMCL or orthoFinder
def readGroups(groups):
    orthogroups = {}
    with open(groups, "r") as OGs:
        for og in OGs:
            if not og.startswith("\t"):
                orthogroups0 = re.split(r'\s*', re.sub(',', ' ', og.rstrip()))
                orthogroups[orthogroups0[0]] = orthogroups0[1:]
    return orthogroups  # {OG1:[geneA1,geneB1,geneC1],OG2:[geneA2,geneB2]}

# stat orthogroup
def statGroups(orthogroups):
    orthogroups_stat = {}
    for og_id in orthogroups.keys():
        species = re.findall(r"(\w+)\|", "\t".join(orthogroups[og_id]))
        species_copy = dict(collections.Counter(species))
        orthogroups_stat[og_id] = species_copy
    return orthogroups_stat

# multiple of list
def mult(list0):
    result = 1
    for i in list0:
        result *= i
    return result


def main():
    orthogroups = readGroups(sys.argv[1])
    orthogroups_stat = statGroups(orthogroups)

    # select "Pso==2,Pni==2", and other species are single copy
    for og, stats in orthogroups_stat.items():
        if len(stats) == 22:
            if stats["Pso"] <= 2 and stats["Pni"] <= 2:
                sp_num = {}
                for sp, number in stats.items():
                    if sp == "Pso" or sp == "Pni":
                        pass
                    else:
                        if number == 1:
                            sp_num[sp] = number
                if sum(sp_num.values()) == 20 and mult(sp_num.values())  == 1:
                    print og


if __name__ == "__main__":
    main()
02 mostly_single_copy

get_mostly_single_copy_orthogroup.py 这个脚本用于从OrthoMCL中选择主要是单拷贝的直系同源组。

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import re
import collections

# read orthogroup of orthoMCL
def readGroups(groups):
    orthogroups = {}
    with open(groups, "r") as OGs:
        for og in OGs:
            if not og.startswith("\t"):
                orthogroups0 = re.split(r'\s*', re.sub(',', ' ', og.rstrip()))
                orthogroups[orthogroups0[0]] = orthogroups0[1:]
    return orthogroups  # {OG1:[geneA1,geneB1,geneC1],OG2:[geneA2,geneB2]}

# stat orthogroup
def statGroups(orthogroups):
    orthogroups_stat = {}
    for og_id in orthogroups.keys():
        species = re.findall(r"(\w+)\|", "\t".join(orthogroups[og_id]))
        species_copy = dict(collections.Counter(species))
        orthogroups_stat[og_id] = species_copy
    return orthogroups_stat

# multiple of list
def mult(list0):
    result = 1
    for i in list0:
        result *= i
    return result

# stat monocots, eudicots, magnoliids, basal angiosperm per OGs cotain number of genes
division = {
    "Smo": "Out",
    "Pab": "Out",
    "Gbi": "Out",
    "Atr": "Out",
    "Nad": "Out",
    "Spo": "Mon",
    "Peq": "Mon",
    "Mac": "Mon",
    "Sbi": "Mon",
    "Osa": "Mon",
    "Afi": "Mog",
    "Cmi": "Mog",
    "Lir": "Mog",
    "Pni": "Mog",
    "Pam": "Mog",
    "Aco": "Eud",
    "Pso": "Eud",
    "Nnu": "Eud",
    "Vvi": "Eud",
    "Sly": "Eud",
    "Ptr": "Eud",
    "Ath": "Eud"}


def main():
    orthogroups = readGroups(sys.argv[1])
    orthogroups_stat = statGroups(orthogroups)
    new_orthogroups_stat = {}
    for og,stats in orthogroups_stat.items():
        new_stats = {}
        for sp in stats:
            new_stats[(sp, division[sp])] = stats[sp]
        new_orthogroups_stat[og] = new_stats
    for og, stats in new_orthogroups_stat.items():
        out, mon, mog, eud = 0, 0, 0, 0
        out0, mon0, mog0, eud0 = 0, 0, 0, 0
        for sp_di in stats:
            if sp_di[1] == "Out":
                if stats[sp_di] == 1:
                    out += 1
                else:
                    out0 += 1
            elif sp_di[1] == "Mon":
                if stats[sp_di] == 1:
                    mon += 1
                else:
                    mon0 += 1
            elif sp_di[1] == "Mog":
                if sp_di[0] == "Pni":
                    if 0 < stats[sp_di] <= 2:
                        mog += 1
                    else:
                        mog0 += 1
                else:
                    if stats[sp_di] == 1:
                        mog += 1
                    else:
                        mog0 += 1
            elif sp_di[1] == "Eud":
                if sp_di[0] == "Pso":
                    if 0 < stats[sp_di] <= 2:
                        eud += 1
                    else:
                        eud0 += 1
                else:
                    if stats[sp_di] == 1:
                        eud += 1
                    else:
                        eud0 += 1
        if out0 == 0 and mon0 == 0 and mog0 == 0 and eud0 == 0:
            if out >= 3 and mon >= 3 and mog >= 3 and eud >= 4:
                print og

if __name__ == "__main__":
    main()


get_supermatrix_from_MSA_for_low_copy.py 这个脚本用于从OrthoMCL中获取基于拼接比对的超级矩阵(supermatrix)。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值