你需要的是一个GTF文件解析器,例如:GTF Parser
根据你的需求,我写了一个简单的解析器(你说的 FTKN 应该是 FPKM 吧?图上标出来的是 FPKM):
假设有如下GTF文件(内容纯属虚构;注意实际文件中前 9 列必须以制表符分隔,这里显示成了空格):
chr1 Cufflinks exon 11874 12227 . + . gene_id "XLOC_000001"; transcript_id "TCONS_00000003"; exon_number "1"; gene_name "uc010nxr.1"; oId "uc010nxr.1"; nearest_ref "uc010nxr.1"; class_code "="; tss_id "TSS1";
chr1 Cufflinks transcript 12646 12697 . + . gene_id "XLOC_000001"; transcript_id "TCONS_00000003"; exon_number "2"; gene_name "uc010nxr.1"; oId "uc010nxr.1"; nearest_ref "uc010nxr.1"; class_code "="; tss_id "TSS1"; FPKM "100.1";
chr1 Cufflinks exon 13221 14409 . + . gene_id "XLOC_000001"; transcript_id "TCONS_00000003"; exon_number "3"; gene_name "uc010nxr.1"; oId "uc010nxr.1"; nearest_ref "uc010nxr.1"; class_code "="; tss_id "TSS1";
chr1 Cufflinks transcript 11874 12227 . + . gene_id "XLOC_000001"; transcript_id "TCONS_00000002"; exon_number "1"; gene_name "uc010nxq.1"; oId "uc010nxq.1"; nearest_ref "uc010nxq.1"; class_code "="; tss_id "TSS1"; p_id "P1";
chr1 Cufflinks exon 12595 12721 . + . gene_id "XLOC_000001"; transcript_id "TCONS_00000002"; exon_number "2"; gene_name "uc010nxq.1"; oId "uc010nxq.1"; nearest_ref "uc010nxq.1"; class_code "="; tss_id "TSS1"; p_id "P1";
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, absolute_import
class GTF(object):
    """One record (line) of a GTF file.

    The first eight tab-separated columns are stored as private string
    attributes; the ninth column is parsed into a ``{key: value}`` dictionary.
    No numeric conversion is performed on any column.
    """

    def __init__(self):
        """Create an empty record; call :meth:`parse_line` to fill it."""
        self._seqid = None
        self._source = None
        self._type = None
        self._start = None
        self._end = None
        self._score = None
        self._strand = None
        self._phase = None
        # Start with an empty mapping so that attribute lookups on a record
        # that has not been parsed yet return None instead of failing.
        self._attributes = {}

    @staticmethod
    def _split_attributes(attribute_string):
        """Split the attribute column on semicolons outside double quotes."""
        chunks = []
        current = []
        in_quotes = False
        for char in attribute_string:
            if char == '"':
                in_quotes = not in_quotes
            if char == ';' and not in_quotes:
                chunks.append(''.join(current))
                current = []
            else:
                current.append(char)
        # Whatever follows the last separator (possibly empty) is a chunk too;
        # this is what makes a missing trailing semicolon harmless.
        chunks.append(''.join(current))
        return chunks

    def parse_line(self, line):
        """Parse one GTF data line and populate this record.

        Attribute pairs may be separated by ``;`` with or without surrounding
        whitespace, the trailing ``;`` is optional, values may contain spaces
        or semicolons when double-quoted, and unquoted values are kept as-is.
        If a key occurs more than once, the last value wins.

        :param line: a single line with nine tab-separated columns
        :return: this object, with all fields filled in
        :raises ValueError: if the line does not have exactly nine columns
        """
        columns = line.rstrip().split('\t')
        if len(columns) != 9:
            raise ValueError(
                'expected 9 tab-separated columns, got %d' % len(columns))

        attributes = {}
        for chunk in self._split_attributes(columns[8]):
            # Split the key from the rest at the first run of whitespace only,
            # so that spaces inside the value are preserved.
            parts = chunk.split(None, 1)
            if not parts:
                continue  # empty chunk, e.g. after the trailing semicolon
            key = parts[0]
            value = parts[1].strip() if len(parts) > 1 else ''
            # Strip the surrounding quotes only when they are really there.
            if len(value) >= 2 and value[0] == '"' and value[-1] == '"':
                value = value[1:-1]
            attributes[key] = value

        # Assign only after parsing succeeded, so a bad line leaves the
        # record untouched.
        (self._seqid, self._source, self._type, self._start, self._end,
         self._score, self._strand, self._phase) = columns[:8]
        self._attributes = attributes
        return self

    def get_transcript_fpkm(self):
        """Return the ``FPKM`` attribute for ``transcript`` records.

        :return: the FPKM value as a string, or None when the record is not
                 of type ``transcript`` or has no ``FPKM`` attribute
        """
        if self._type == 'transcript':
            return self._attributes.get('FPKM', None)
        return None

    def get_attribute(self, key):
        """Return the value of attribute ``key``, or None if it is absent."""
        return self._attributes.get(key, None)
def main(argv=None):
    """Print the transcript ID and FPKM of every transcript record in a GTF file.

    :param argv: argument list in ``sys.argv`` layout (program name first);
                 defaults to ``sys.argv``
    :return: process exit status: 0 on success, 2 when the file path argument
             is missing
    """
    import sys

    if argv is None:
        argv = sys.argv
    # Check the argument explicitly instead of catching IndexError around the
    # whole loop, which would misreport unrelated errors as a usage problem.
    if len(argv) < 2:
        sys.stderr.write('Usage: python %s [path-to-gtf-file]\n' % __file__)
        return 2

    with open(argv[1]) as gtf_file:
        for line in gtf_file:
            stripped = line.strip()
            # Skip blank lines and comment lines.
            if not stripped or stripped.startswith('#'):
                continue
            record = GTF().parse_line(line)
            fpkm = record.get_transcript_fpkm()
            if fpkm:
                print('Transcript ID: %s\tFPKM: %s' % (record.get_attribute('transcript_id'), fpkm))
    return 0


if __name__ == '__main__':
    raise SystemExit(main())
使用方法:python fpkm.py example.gtf
输出:Transcript ID: TCONS_00000003 FPKM: 100.1