根据gff文件判断一段序列是否位于其内

最新推荐文章于 2020-12-30 08:11:17 发布

msw521sg

最新推荐文章于 2020-12-30 08:11:17 发布

阅读量826

点赞数

分类专栏：生物信息 python 文章标签：基因组-生物信息学 python gff3

本文链接：https://blog.csdn.net/msw521sg/article/details/52562229

版权

生物信息同时被 2 个专栏收录

54 篇文章 14 订阅

订阅专栏

python

27 篇文章 0 订阅

订阅专栏

1.txt

UN227692 chr6B 558820383 558820604 . 100.0 99.1 216 2 0 4
UN113387 chr7A 683635472 683635624 . 100.0 100.0 153 0 0 0
UN128584 chr7D 27592786 27593326 . 100.0 100.0 541 0 0 0
UN170802 chr4B 505369881 505370146 . 98.9 99.6 265 1 0 0
UN181903 chr3A 106517703 106518022 . 100.0 100.0 320 0 0 0
UN076932 chr2B 452598011 452598795 . 99.1 99.2 781 4 2 0
UN067930 chr3D 23548729 23549162 . 100.0 100.0 434 0 0 0
UN066624 chr2B 514684148 514685994 + 100.0 100.0 697 0 0 0
UN025033 chr5A 66831930 66832944 . 100.0 99.9 1014 0 1 0
UN011472 chr2D 600422765 600422990 . 100.0 99.6 225 1 0 0
UN061273 chr6D 445614030 445614663 + 98.4 99.6 538 1 1 0
UN156200 chr2B 32058281 32059267 + 97.9 99.6 275 0 1 0
UN185796 chr5A 585001939 585002769 - 100.0 99.6 745 3 0 0
UN175374 chr3D 34709434 34709880 . 100.0 98.4 440 7 0 0
UN184680 chr6A 603337083 603337320 . 100.0 100.0 238 0 0 0
UN088028 chr1A 581028511 581028919 . 97.4 100.0 409 0 0 0

augustus.gff

chr1A	AUGUSTUS	gene	4608	5033	0.95	+	.	g1
chr1A	AUGUSTUS	transcript	4608	5033	0.95	+	.	g1.t1
chr1A	AUGUSTUS	start_codon	4608	4610	.	+	0	transcript_id "g1.t1"; gene_id "g1";
chr1A	AUGUSTUS	CDS	4608	5033	0.95	+	0	transcript_id "g1.t1"; gene_id "g1";
chr1A	AUGUSTUS	exon	4608	5033	.	+	.	transcript_id "g1.t1"; gene_id "g1";
chr1A	AUGUSTUS	stop_codon	5031	5033	.	+	0	transcript_id "g1.t1"; gene_id "g1";
chr1A	AUGUSTUS	gene	7095	7403	0.97	+	.	g2
chr1A	AUGUSTUS	transcript	7095	7403	0.97	+	.	g2.t1
chr1A	AUGUSTUS	start_codon	7095	7097	.	+	0	transcript_id "g2.t1"; gene_id "g2";
chr1A	AUGUSTUS	CDS	7095	7403	0.97	+	0	transcript_id "g2.t1"; gene_id "g2";
chr1A	AUGUSTUS	exon	7095	7403	.	+	.	transcript_id "g2.t1"; gene_id "g2";
chr1A	AUGUSTUS	stop_codon	7401	7403	.	+	0	transcript_id "g2.t1"; gene_id "g2";
chr1A	AUGUSTUS	gene	7752	8069	0.59	-	.	g3
chr1A	AUGUSTUS	transcript	7752	8069	0.59	-	.	g3.t1
chr1A	AUGUSTUS	stop_codon	7752	7754	.	-	0	transcript_id "g3.t1"; gene_id "g3";
chr1A	AUGUSTUS	CDS	7752	8069	0.59	-	0	transcript_id "g3.t1"; gene_id "g3";
chr1A	AUGUSTUS	exon	7752	8069	.	-	.	transcript_id "g3.t1"; gene_id "g3";
chr1A	AUGUSTUS	start_codon	8067	8069	.	-	0	transcript_id "g3.t1"; gene_id "g3";
chr1A	AUGUSTUS	gene	19696	20019	0.75	+	.	g4
chr1A	AUGUSTUS	transcript	19696	20019	0.75	+	.	g4.t1
chr1A	AUGUSTUS	start_codon	19696	19698	.	+	0	transcript_id "g4.t1"; gene_id "g4";
chr1A	AUGUSTUS	CDS	19696	20019	0.75	+	0	transcript_id "g4.t1"; gene_id "g4";
chr1A	AUGUSTUS	exon	19696	20019	.	+	.	transcript_id "g4.t1"; gene_id "g4";
chr1A	AUGUSTUS	stop_codon	20017	20019	.	+	0	transcript_id "g4.t1"; gene_id "g4";
chr1A	AUGUSTUS	gene	22080	22388	1	+	.	g5
chr1A	AUGUSTUS	transcript	22080	22388	1	+	.	g5.t1
chr1A	AUGUSTUS	start_codon	22080	22082	.	+	0	transcript_id "g5.t1"; gene_id "g5";
chr1A	AUGUSTUS	CDS	22080	22388	1	+	0	transcript_id "g5.t1"; gene_id "g5";
chr1A	AUGUSTUS	exon	22080	22388	.	+	.	transcript_id "g5.t1"; gene_id "g5";
chr1A	AUGUSTUS	stop_codon	22386	22388	.	+	0	transcript_id "g5.t1"; gene_id "g5";
chr1A	AUGUSTUS	gene	22678	23055	0.95	-	.	g6
chr1A	AUGUSTUS	transcript	22678	23055	0.95	-	.	g6.t1
chr1A	AUGUSTUS	stop_codon	22678	22680	.	-	0	transcript_id "g6.t1"; gene_id "g6";
chr1A	AUGUSTUS	CDS	22678	23055	0.95	-	0	transcript_id "g6.t1"; gene_id "g6";
chr1A	AUGUSTUS	exon	22678	23055	.	-	.	transcript_id "g6.t1"; gene_id "g6";
chr1A	AUGUSTUS	start_codon	23053	23055	.	-	0	transcript_id "g6.t1"; gene_id "g6";
chr1A	AUGUSTUS	gene	24298	24594	0.58	+	.	g7
chr1A	AUGUSTUS	transcript	24298	24594	0.58	+	.	g7.t1
chr1A	AUGUSTUS	start_codon	24298	24300	.	+	0	transcript_id "g7.t1"; gene_id "g7";
chr1A	AUGUSTUS	CDS	24298	24594	0.58	+	0	transcript_id "g7.t1"; gene_id "g7";
chr1A	AUGUSTUS	exon	24298	24594	.	+	.	transcript_id "g7.t1"; gene_id "g7";
chr1A	AUGUSTUS	stop_codon	24592	24594	.	+	0	transcript_id "g7.t1"; gene_id "g7";

#!/usr/bin/env python
# encoding: utf-8

from collections import defaultdict
from multiprocessing import Pool, cpu_count
from functools import partial


def find_sth(f2, f1=None):
    start, end = int(f2[3]), int(f2[4])
    for uno1, start1, end1 in f1[f2[0]]:
        if (start1 <= start and start <= end1) or (start1 <= end and end <= end1) or (start1 >= start and end >= end1):
            with open("out.txt", "a") as fh:
                fh.write(uno1 + "\t" + f2[-1] + "\n")
                #print(uno1, f2[-1])


def main():
    with open('1.txt', 'r') as f1:
        infile1 = defaultdict(set)
        for uno1, chr1, start1, end1, *others in map(str.split, f1):
            infile1[chr1].add((uno1, int(start1), int(end1)))
    with open('genemark.gff3', 'r') as f2:
        infile2 = [x for x in map(str.split, f2) if x[2] == 'gene']
    pool = Pool(cpu_count())
    pool.map(partial(find_sth, f1=infile1), infile2)
    pool.close()
    pool.join()


if __name__ == "__main__":
    main()

from collections import defaultdict
from intervaltree import Interval, IntervalTree

with open('1.txt') as f:
    d1 = defaultdict(list)
    xs = map(lambda x: x.strip().split(), f)
    for x in xs:
        y = (x[0], int(x[2]), int(x[3]))
        d1[x[1]].append(y)

    for k, v in d1.items():
        d1[k] = IntervalTree(Interval(s, e, u) for u, s, e in v)

with open('genemark.gff3') as f:
    for line in f:
        line = line.strip().split()
        if line[2] == 'gene':
            chr, start, end = line[0], int(line[3]), int(line[4])
            for start1, end1, un1 in d1[chr][start-1:end+1]:
                print(un1, line[-1])

#!/usr/bin/env python
# -*- coding: utf-8 -*-
infile2 = open('augustus.gtf', 'r')
infile1 = set(line1.strip() for line1 in open('1.txt', 'r'))
for line in infile2:
    line = line.strip().split()
    if line[2] == 'transcript':
        chr, start, end = line[0], int(line[3]), int(line[4])
        for line1 in infile1:
            line1 = line1.split()
            chr1, start1, end1 = line1[1], int(line1[2]), int(line1[3])
            if chr1 == chr:
                if start1 < start < end1:
                    print line1[0], line[-1]
                    break
                if start1 < end < end1:
                    print line1[0], line[-1]
                    break
            else:
                continue

msw521sg

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
根据gff文件判断一段序列是否位于其内

1.txtUN227692 chr6B 558820383 558820604 . 100.0 99.1 216 2 0 4UN113387 chr7A 683635472 683635624 . 100.0 100.0 153 0 0 0UN128584 chr7D 27592786 27593326 . 100.0 100.0 541 0 0 0UN170802 chr4B 5053
复制链接

扫一扫

专栏目录