将 GFF3 文件中的 5'UTR、CDS、3'UTR 坐标转换为以 mRNA(转录本)为参照的相对坐标

1.输入文件,gff3文件
2.脚本(分为 Windows 版本和 Linux 版本)

Windows版本:

import csv

def parse_gff3(input_file):
    """Collect UTR and CDS segments from a GFF3 file, grouped by parent transcript.

    Only ``five_prime_UTR``, ``CDS`` and ``three_prime_UTR`` rows are kept.
    Comment lines, blank lines and rows with fewer than nine tab-separated
    columns are ignored, as are features that carry no ``Parent`` attribute.
    A feature listing several parents (``Parent=a,b``) is recorded under each.

    Args:
        input_file: Path to a UTF-8 encoded GFF3 file.

    Returns:
        A dict mapping each parent ID to a dict that maps feature type to a
        list of ``{'start': int, 'end': int}`` dicts in file order.

    Raises:
        ValueError: If a start or end column of a kept row is not an integer.
    """
    wanted_types = ('five_prime_UTR', 'CDS', 'three_prime_UTR')
    features = {}
    # newline='' lets the csv module do its own line-ending handling.
    with open(input_file, 'r', encoding='utf-8', newline='') as file:
        # GFF3 has no quoting convention; with the default quoting a field
        # that starts with '"' would swallow the following tabs and lines.
        reader = csv.reader(file, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            # Check the length first: a blank line yields an empty row, so
            # indexing row[0] before this test would raise IndexError.
            if len(row) < 9 or row[0].startswith('#'):
                continue
            # Index instead of unpacking so rows with extra columns still parse.
            feature_type, start, end, attributes = row[2], row[3], row[4], row[8]
            if feature_type not in wanted_types:
                continue
            attr_dict = {}
            for attr in attributes.split(';'):
                # partition keeps any '=' inside the value intact.
                key, sep, value = attr.partition('=')
                if sep:
                    attr_dict[key.strip()] = value
            parent_value = attr_dict.get('Parent')
            if not parent_value:
                # Without a parent there is no transcript to attach to.
                continue
            for parent_id in parent_value.split(','):
                parent_id = parent_id.strip()
                if not parent_id:
                    continue
                segments = features.setdefault(parent_id, {}).setdefault(feature_type, [])
                segments.append({'start': int(start), 'end': int(end)})
    return features

def write_new_gff3(features, output_file):
    """Write a tab-separated table of transcript-relative region coordinates.

    For every transcript the regions are laid out back to back in the order
    5'UTR, CDS, 3'UTR, starting at position 1. Each region's length is the
    sum of the inclusive lengths of its segments.

    Args:
        features: Mapping of transcript ID to a dict that maps feature type
            to a list of ``{'start': int, 'end': int}`` dicts.
        output_file: Path of the file to create or overwrite (UTF-8).
    """
    region_order = ('five_prime_UTR', 'CDS', 'three_prime_UTR')
    # newline='' stops '\n' from being expanded to '\r\n' on Windows.
    with open(output_file, 'w', encoding='utf-8', newline='') as out:
        out.write("ID\tType\tStart\tEnd\tLength\n")
        for transcript_id, regions in features.items():
            next_start = 1
            for region in region_order:
                if region not in regions:
                    continue
                segments = sorted(regions[region], key=lambda seg: seg['start'])
                span = 0
                for seg in segments:
                    span += seg['end'] - seg['start'] + 1
                last = next_start + span - 1
                out.write(f"{transcript_id}\t{region}\t{next_start}\t{last}\t{span}\n")
                # The next region begins right after this one ends.
                next_start = last + 1

# Hardcoded file paths: edit these two values to point at your own
# input GFF3 file and the desired output location before running.
input_file = 'E:/pythonworking/file/20240430.txt'
output_file = 'E:/pythonworking/file/20240430out.txt'

# Executed at module level: parse the input file, then write the summary table.
features = parse_gff3(input_file)
write_new_gff3(features, output_file)

linux版本:

import csv
import argparse

def parse_gff3(input_file):
    """Read a GFF3 file and collect 5'UTR, CDS and 3'UTR segments per transcript.

    Comment lines, blank lines and rows with fewer than nine tab-separated
    columns are skipped, as are features without a ``Parent`` attribute.
    When a feature lists several parents (``Parent=a,b``) its segment is
    stored under every one of them.

    Args:
        input_file: Path to a UTF-8 encoded GFF3 file.

    Returns:
        A dict mapping each parent ID to a dict that maps feature type to a
        list of ``{'start': int, 'end': int}`` dicts in file order.

    Raises:
        ValueError: If a start or end column of a kept row is not an integer.
    """
    kept_types = ('five_prime_UTR', 'CDS', 'three_prime_UTR')
    features = {}
    # An explicit encoding keeps parsing independent of the platform locale;
    # newline='' leaves line-ending handling to the csv module.
    with open(input_file, 'r', encoding='utf-8', newline='') as file:
        # Disable quote handling: GFF3 does not quote fields, and a column
        # starting with '"' would otherwise absorb following tabs and lines.
        reader = csv.reader(file, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            # The length test must come first because blank lines produce an
            # empty row, for which row[0] would raise IndexError.
            if len(row) < 9 or row[0].startswith('#'):
                continue
            # Pick columns by index so trailing extra columns are tolerated.
            feature_type, start, end, attributes = row[2], row[3], row[4], row[8]
            if feature_type not in kept_types:
                continue
            attr_dict = {}
            for attr in attributes.split(';'):
                # Split on the first '=' only so values may contain '='.
                key, sep, value = attr.partition('=')
                if sep:
                    attr_dict[key.strip()] = value
            parent_value = attr_dict.get('Parent')
            if not parent_value:
                # No parent means there is no transcript to group under.
                continue
            for parent_id in parent_value.split(','):
                parent_id = parent_id.strip()
                if not parent_id:
                    continue
                by_type = features.setdefault(parent_id, {})
                by_type.setdefault(feature_type, []).append({
                    'start': int(start),
                    'end': int(end)
                })

    return features

def write_new_gff3(features, output_file):
    """Write per-transcript region coordinates as a tab-separated table.

    The output has the columns ID, Type, Start, End and Length. Within one
    transcript the 5'UTR, CDS and 3'UTR regions are placed consecutively,
    the first one starting at position 1; a region's length is the sum of
    the inclusive lengths of all its segments.

    Args:
        features: Mapping of transcript ID to a dict that maps feature type
            to a list of ``{'start': int, 'end': int}`` dicts.
        output_file: Path of the file to create or overwrite.
    """
    row_template = "{}\t{}\t{}\t{}\t{}\n"
    with open(output_file, 'w') as handle:
        handle.write("ID\tType\tStart\tEnd\tLength\n")
        for transcript_id, by_type in features.items():
            cursor = 1  # first free position on the transcript
            for kind in ('five_prime_UTR', 'CDS', 'three_prime_UTR'):
                if kind not in by_type:
                    continue
                ordered = sorted(by_type[kind], key=lambda item: item['start'])
                lengths = [item['end'] - item['start'] + 1 for item in ordered]
                region_length = sum(lengths)
                region_end = cursor + region_length - 1
                handle.write(row_template.format(
                    transcript_id, kind, cursor, region_end, region_length
                ))
                # Advance past this region for the next feature type.
                cursor = region_end + 1

def main():
    """Parse the command-line options and convert the input file.

    Requires ``-i/--input`` (GFF3 file to read) and ``-o/--output`` (file to
    write); the parsed features are passed straight to the writer.
    """
    cli = argparse.ArgumentParser(
        description="Reformat GFF3 file to transcript-centric simplified format with additional length info"
    )
    cli.add_argument("-i", "--input", required=True, help="Input GFF3 file path")
    cli.add_argument("-o", "--output", required=True, help="Output GFF3 file path")
    options = cli.parse_args()

    # Read the annotation, then emit the transcript-relative table.
    write_new_gff3(parse_gff3(options.input), options.output)

# Run the command-line entry point only when executed as a script, not on import.
if __name__ == "__main__":
    main()

3.输出文件
输出为制表符分隔的文本文件,包含 ID、Type、Start、End、Length 五列(原文此处为输出示例图片)。

  • 9
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值