「像读文献一样读代码」第一期：如何解析GTF文件进行统计分析？

本文链接：https://blog.csdn.net/qq524730309/article/details/126336181

测试数据下载

wget -c ftp://ftp.ensembl.org/pub/release-87/gtf/homo_sapiens/Homo_sapiens.GRCh38.87.chr.gtf.gz

先验知识

代码解读

实现了哪些功能？

1）读取GTF文件

2）根据feature，将其分为3种数据类型进行保存，即gene、transcript、exon。

每一种数据类型，分别构建一种对象，每种对象所具有的属性定义如下，

Object gene包含的属性有：所属的染色体，起始位置，终止位置，正负链信息，gene id（对应GTF文件中的gene_id）
Object transcript包含的属性有：所属的染色体，起始位置，终止位置，transcript id（对应GTF文件中的transcript_id），parent id（对应GTF文件中的gene_id）
Object exon包含的属性有：所属的染色体，起始位置，终止位置，parent id（对应GTF文件中的transcript_id）

3）数据处理

统计每一条染色体上的gene总数
计算每一个gene的长度
统计每个gene的转录本数
记录每个exon的坐标

代码设计思路

将GTF文件的每一行看作一个对象来处理，构建不同类型的对象对信息进行存储，同时由于不同的对象之间存在信息交集，这就是类继承的应用之处。

Python代码

在正则表达式上，我进行了一定修改。

'''
Date: 2022.8.14
Description: Kind of a collection of tools to parse and analyze GTF
Main Function:
- count gene number of each chromosome
- count transcritp number of each gene
- calculate the total number of exon and intron of each transcript
- plot the result
'''
import sys
import re

args = sys.argv  # Set argv to grep the ipnut. argv is a list, e.g. [, , ]

class Genome_info():
    def __init__(self):
        self.chr = ""
        self.start = 0
        self.end = 0

class Gene(Genome_info):
    '''
    Note: Object Gene is inherited from Genome_info
    Gene has two attributes,
    1) orientation
    2) id (gene id)
    '''
    def __init__(self):
        Genome_info.__init__(self)
        self.orientation = ""
        self.id = ""

class Transcript(Genome_info):
    '''
    Note: Object Transcript is inherited from Genome_info
    Transcript has two attributes,
    1) id (transcript id)
    2) parent (gene id)
    '''
    def __init__(self):
        Genome_info.__init__(self)
        self.id = ""
        self.parent = ""

class Exon(Genome_info):
    '''
    Note: Object Exon is inherited from Genome_info
    Transcript has one attributes,
    2) parent (?)
    '''
    def __init__(self):
        Genome_info.__init__(self)
        self.parent = ""


'''
Define a main function to handle gtf file
'''
def main(args):

    list_chr = []
    list_gene = {}
    list_transcript = {}
    list_exon = []
    # l_n = 0
    with open(args[1]) as fp_gtf:
        for line in fp_gtf:
            if line.startswith("#"):
                continue
            # print ("in %s" % l_n)
            # l_n += 1
            lines = line.strip("\n").split("\t")
            chr = lines[0]
            type = lines[2]
            start = int(lines[3])
            end = int(lines[4])
            orientation = lines[6]
            attr = lines[8]
            if not re.search(r'protein_coding', attr):   # 只针对protein_coding进行统计分析
                continue

            if not chr in list_chr :
                list_chr.append(chr)

            if type == "gene":
                gene = Gene()
                id = re.search(r'gene_id "([^;]+)";?', attr).group(1)  # Original version, could be adapted to r'gene_id "(\S+)";'
                # id = re.search(r'gene_id "(\S+)";', attr).group(1)      # My adaptation
                gene.chr = chr
                gene.start = start
                gene.end = end
                gene.id = id
                gene.orientation = orientation
                list_gene[id] = gene  # list_gene is a dict to depost gene id and its correspoding Gene Object
                # print(id)
            
            elif type == "transcript":
                transcript = Transcript()
                id = re.search(r'transcript_id "([^;]+)";?', attr).group(1)
                # id = re.search(r'transcript_id "(\S+)";', attr).group(1)      # My adaptation
                parent = re.search(r'gene_id "([^;]+)";?', attr).group(1)
                # id = re.search(r'gene_id "(\S+)";', attr).group(1)      # My adaptation
                
                if not parent in list_gene:
                    continue
                transcript.chr = chr
                transcript.start = start
                transcript.end = end
                transcript.id = id
                transcript.parent = parent
                list_transcript[id] = transcript

            elif type == "exon":
                exon = Exon()
                parent = re.search(r'transcript_id "([^;]+)";?', attr).group(1)
                # parent = re.search(r'transcript_id "(\S+)";', attr).group(1) # My adaptation
                
                if not parent in list_transcript:
                    continue
                exon.chr = chr
                exon.start = start
                exon.end = end
                exon.parent = parent
                list_exon.append(exon)

    chr_gene(list_gene)
    gene_len(list_gene)
    gene_transcript(list_transcript)
    transcript_exon(list_exon)
    exon_pos(list_exon)

def chr_gene(list_gene):
    print("-------------------------------- Count gene number of each chromosome --------------------------------")
    count_gene = {}
    for info in list_gene.values():
        chr = info.chr
        if chr in count_gene:
            count_gene[info.chr] += 1
        else:
            count_gene[info.chr] = 1
    with open("chr_gene.txt", 'w') as fp_out:
        for chr, num in count_gene.items():
            print("\t".join([chr, str(num)]) + "\n")
            fp_out.write("\t".join([chr, str(num)]) + "\n")


def gene_len(list_gene):
    print("-------------------------------- Calculate the length of each gene --------------------------------")
    with open("gene_len.txt", 'w') as fp_out:
        for gene_id, info in list_gene.items():
            len = info.end - info.start + 1
            fp_out.write("\t".join([gene_id, str(len)]) + "\n")
            # print("\t".join([gene_id, str(len)]) + "\n")

def gene_transcript(list_transcript):
    print("-------------------------------- Count transcript number of each gene --------------------------------")
    count_transcript = {}
    for info in list_transcript.values():
        gene_id = info.parent
        if gene_id in count_transcript:
            count_transcript[gene_id] += 1
        else:
            count_transcript[gene_id] = 1
    with open("gene_transcript.txt", 'w') as fp_out:
        for gene_id, num in count_transcript.items():
            # print("\t".join([gene_id, str(num)]) + "\n")
            fp_out.write("\t".join([gene_id, str(num)]) + "\n")


def transcript_exon(list_exon):
    print("-------------------------------- Count exon number of each transcript --------------------------------")
    count_exon = {}
    for exon in list_exon:
        transcript_id = exon.parent
        if transcript_id in count_exon:
            count_exon[transcript_id] += 1
        else:
            count_exon[transcript_id] = 1
    with open("transcript_exon.txt", 'w') as fp_out:
        for transcript_id, num in count_exon.items():
            # print("\t".join([transcript_id, str(num)]) + "\n")
            fp_out.write("\t".join([transcript_id, str(num)]) + "\n")

def exon_pos(list_exon):
    print("-------------------------------- Save the position info of each exon --------------------------------")
    count_exon = {}
    for exon in list_exon:
        transcript_id = exon.parent
        if transcript_id in count_exon:
            count_exon[transcript_id] += ",%s-%s" % (str(exon.start), str(exon.end))
        else:
            count_exon[transcript_id] = "%s-%s" % (str(exon.start), str(exon.end))
    with open("exon_pos.txt", 'w') as fp_out:
        for transcript_id, pos in count_exon.items():
            print("\t".join([transcript_id, pos]) + "\n")
            fp_out.write("\t".join([transcript_id, pos]) + "\n")


def gene_exon_pos(list_gene, list_transcript, list_exon):
    pass


if __name__ == "__main__":
    main(args)