批量提取antiSMASH核心基因编码AA序列（JSON文件）

Chenhu7

已于 2022-02-04 11:43:47 修改

阅读量905

点赞数 1

分类专栏：一些生信工具 文章标签：json 生物信息学

于 2021-08-10 23:05:16 首次发布

本文链接：https://blog.csdn.net/Curry_chenhu/article/details/119581494

版权

一些生信工具专栏收录该内容

15 篇文章 9 订阅

订阅专栏

此博客介绍了如何使用Python脚本从antiSMASH结果JSON文件中提取核心蛋白质序列：逐个处理JSON文件，解析被注释的基因簇并筛选出核心生物合成基因，最终将其氨基酸序列保存为FASTA格式。同时，它还提及了错误处理机制，确保在处理过程中记录出错文件的信息。

摘要由CSDN通过智能技术生成

import json
import os

def _iter_core_proteins(root):
    """Yield ``(header, sequence)`` FASTA pairs for the core biosynthetic genes.

    :param root: parsed content of one antiSMASH result JSON file; must
        provide the ``"records"`` and ``"timings"`` keys.
    :return: generator of ``(">" + record_id + "|" + location, sequence)``
        tuples, in the order the features appear in the file.
    :raises KeyError: if an expected key (``records``, ``timings``, ``id``,
        ``features``, ``qualifiers``, ``location``, ``translation``) is missing.
    """
    # Only records whose id appears as a key of "timings" are exported; the
    # script treats those as the records antiSMASH actually annotated.
    annotated_ids = set(root["timings"])

    for record in root["records"]:
        record_id = record["id"]
        if record_id not in annotated_ids:
            continue

        for feature in record["features"]:
            qualifiers = feature["qualifiers"]
            # Keep only features flagged as core ("biosynthetic") genes.
            if qualifiers.get("gene_kind") != ["biosynthetic"]:
                continue

            header = ">" + record_id + "|" + feature["location"]
            yield header, qualifiers["translation"][0]


def extract(json_path,out_path):
    """Export core biosynthetic protein sequences from antiSMASH JSON files.

    Every ``*.json`` file directly inside ``json_path`` is parsed and the
    amino-acid translation of each feature whose ``gene_kind`` qualifier is
    ``["biosynthetic"]`` is written to ``<out_path>/<name>.faa`` in FASTA
    format, with the header ``>record_id|location``.

    A file that cannot be read or parsed does not abort the run: its name and
    the exception are appended as one tab-separated line to ``error_log.txt``
    in the current working directory (the log is recreated on every call).
    No ``.faa`` file is written for a file that failed.

    :param json_path: directory containing the antiSMASH result JSON files
        (a trailing path separator is optional).
    :param out_path: directory that receives the ``.faa`` files; it is
        created, including missing parents, when it does not exist.
    :raises OSError: if ``json_path`` cannot be listed, or ``out_path`` or the
        error log cannot be created.
    """
    suffix = ".json"
    # Sorted so that processing order and the error log are reproducible.
    json_files = sorted(os.listdir(json_path))

    os.makedirs(out_path, exist_ok=True)

    # The log is opened exactly once; reopening it in write mode for every
    # failure would erase the previously recorded file names.
    with open("error_log.txt", "w", encoding="utf-8") as error_log:
        for json_file in json_files:
            if not json_file.endswith(suffix):
                continue

            try:
                source = os.path.join(json_path, json_file)
                with open(source, encoding="utf-8") as handle:
                    root = json.load(handle)

                # Materialise first so a malformed record cannot leave a
                # truncated FASTA file behind.
                entries = list(_iter_core_proteins(root))

                target = os.path.join(
                    out_path, json_file[:-len(suffix)] + ".faa")
                with open(target, "w", encoding="utf-8") as fasta:
                    for header, sequence in entries:
                        fasta.write(header + "\n")
                        fasta.write(sequence + "\n")
            except Exception as exc:
                # Per-file boundary: record the failure and carry on with the
                # remaining files. KeyboardInterrupt/SystemExit still propagate.
                error_log.write("{}\t{!r}\n".format(json_file, exc))

def main():
    """Run :func:`extract` on the script's fixed input and output folders."""
    source_dir = "../JSON_anismash/"
    target_dir = "out_path/"
    extract(source_dir, target_dir)

# Start the extraction only when this file is executed as a script,
# not when it is imported as a module.
if "__main__" == __name__:
    main()