将 GFF3 文件转换为以 mRNA（转录本）为坐标的形式

不懂python不懂R

已于 2024-04-30 16:07:11 修改

阅读量173

点赞数 9

分类专栏： python 文章标签： python

于 2024-04-30 09:30:55 首次发布

本文链接：https://blog.csdn.net/weixin_44231554/article/details/138335385

版权

python 专栏收录该内容

18 篇文章 0 订阅

订阅专栏

1. 输入文件：GFF3 格式的注释文件（需包含带 Parent 属性的 five_prime_UTR、CDS、three_prime_UTR 特征行）
2. 脚本：分为 Windows 版本和 Linux 版本

windows版本

import csv

def parse_gff3(input_file):
    """Collect UTR and CDS segments from a GFF3 file, grouped by parent ID.

    Only ``five_prime_UTR``, ``CDS`` and ``three_prime_UTR`` rows are kept.
    Comment/directive lines (starting with ``#``), blank lines and rows with
    fewer than nine tab-separated columns are skipped, as are features that
    carry no ``Parent`` attribute.

    Args:
        input_file: Path to the GFF3 file (read as UTF-8).

    Returns:
        dict mapping each parent ID to ``{feature_type: [{'start': int,
        'end': int}, ...]}``, with segments in file order.

    Raises:
        ValueError: If a kept row has a non-integer start or end column.
    """
    wanted_types = ('five_prime_UTR', 'CDS', 'three_prime_UTR')
    features = {}
    # Explicit encoding so the result does not depend on the Windows locale;
    # newline='' lets the csv module do its own line-ending handling.
    with open(input_file, 'r', encoding='utf-8', newline='') as file:
        # QUOTE_NONE: GFF3 has no quoting convention, so a literal '"' inside
        # an attribute must not be allowed to swallow the following tabs.
        reader = csv.reader(file, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            # Length check first: blank lines come back as [] and indexing
            # row[0] on them would raise IndexError.
            if len(row) < 9 or row[0].startswith('#'):
                continue
            # Index instead of unpacking so rows with extra columns still parse.
            feature_type, start, end, attributes = row[2], row[3], row[4], row[8]
            if feature_type not in wanted_types:
                continue

            attr_dict = {}
            for attr in attributes.split(';'):
                # partition keeps any further '=' inside the value intact.
                key, sep, value = attr.partition('=')
                if sep:
                    attr_dict[key.strip()] = value.strip()

            parent_field = attr_dict.get('Parent')
            if not parent_field:
                # Without a parent there is no transcript to attach this to.
                continue

            start_pos, end_pos = int(start), int(end)
            # A feature may list several comma-separated parents (e.g. a CDS
            # shared by two transcripts); record it under each of them.
            for parent_id in parent_field.split(','):
                parent_id = parent_id.strip()
                if not parent_id:
                    continue
                by_type = features.setdefault(parent_id, {})
                by_type.setdefault(feature_type, []).append(
                    {'start': start_pos, 'end': end_pos}
                )
    return features

def write_new_gff3(features, output_file):
    """Write a tab-separated table of transcript-relative region coordinates.

    For every transcript the regions are laid out back to back in the order
    five_prime_UTR, CDS, three_prime_UTR, starting at position 1. Each
    region's length is the sum of the inclusive lengths of its segments.

    Args:
        features: dict mapping transcript ID to
            ``{feature_type: [{'start': int, 'end': int}, ...]}``.
        output_file: Path of the table to create (overwritten if present).
    """
    region_order = ['five_prime_UTR', 'CDS', 'three_prime_UTR']
    # newline='' stops Windows from translating '\n' into '\r\n' on write.
    with open(output_file, 'w', encoding='utf-8', newline='') as out:
        out.write("ID\tType\tStart\tEnd\tLength\n")
        for transcript_id, regions in features.items():
            next_start = 1
            for region in region_order:
                if region not in regions:
                    continue
                segments = sorted(regions[region], key=lambda seg: seg['start'])
                span = sum(seg['end'] - seg['start'] + 1 for seg in segments)
                last = next_start + span - 1
                out.write(f"{transcript_id}\t{region}\t{next_start}\t{last}\t{span}\n")
                # The next region begins right after this one ends.
                next_start = last + 1

# Hardcoded file paths: edit these two values to point at your own files.
# input_file is read by parse_gff3; output_file is opened in 'w' mode by
# write_new_gff3, so an existing file at that path is overwritten.
input_file = 'E:/pythonworking/file/20240430.txt'
output_file = 'E:/pythonworking/file/20240430out.txt'

# These calls sit at module level with no __main__ guard, so they run
# whenever this script is executed or imported.
features = parse_gff3(input_file)
write_new_gff3(features, output_file)

Linux 版本（通过命令行参数 -i/--input 和 -o/--output 指定输入、输出文件路径）：

import csv
import argparse

def parse_gff3(input_file):
    """Read a GFF3 file and collect 5'UTR, CDS and 3'UTR segments per parent.

    Only ``five_prime_UTR``, ``CDS`` and ``three_prime_UTR`` rows are kept.
    Comment/directive lines (starting with ``#``), blank lines and rows with
    fewer than nine tab-separated columns are skipped, as are features that
    carry no ``Parent`` attribute.

    Args:
        input_file: Path to the GFF3 file (read as UTF-8).

    Returns:
        dict mapping each parent ID to ``{feature_type: [{'start': int,
        'end': int}, ...]}``, with segments in file order.

    Raises:
        ValueError: If a kept row has a non-integer start or end column.
    """
    wanted_types = ('five_prime_UTR', 'CDS', 'three_prime_UTR')
    features = {}
    # Explicit UTF-8 so parsing does not depend on the process locale;
    # newline='' lets the csv module do its own line-ending handling.
    with open(input_file, 'r', encoding='utf-8', newline='') as file:
        # QUOTE_NONE: GFF3 has no quoting convention, so a literal '"' inside
        # an attribute must not be allowed to swallow the following tabs.
        reader = csv.reader(file, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            # Length check first: blank lines come back as [] and indexing
            # row[0] on them would raise IndexError.
            if len(row) < 9 or row[0].startswith('#'):
                continue
            # Index instead of unpacking so rows with extra columns still parse.
            feature_type, start, end, attributes = row[2], row[3], row[4], row[8]
            if feature_type not in wanted_types:
                continue

            attr_dict = {}
            for attr in attributes.split(';'):
                # partition keeps any further '=' inside the value intact.
                key, sep, value = attr.partition('=')
                if sep:
                    attr_dict[key.strip()] = value.strip()

            parent_field = attr_dict.get('Parent')
            if not parent_field:
                # Without a parent there is no transcript to attach this to.
                continue

            start_pos, end_pos = int(start), int(end)
            # A feature may list several comma-separated parents (e.g. a CDS
            # shared by two transcripts); record it under each of them.
            for parent_id in parent_field.split(','):
                parent_id = parent_id.strip()
                if not parent_id:
                    continue
                by_type = features.setdefault(parent_id, {})
                by_type.setdefault(feature_type, []).append(
                    {'start': start_pos, 'end': end_pos}
                )

    return features

def write_new_gff3(features, output_file):
    """ Write a tab-separated table with ID, Type, Start, End and Length per transcript region.

    Regions are placed back to back in transcript coordinates, starting at 1,
    in the order five_prime_UTR, CDS, three_prime_UTR.
    """
    ordered_types = ['five_prime_UTR', 'CDS', 'three_prime_UTR']
    with open(output_file, 'w') as handle:
        handle.write("ID\tType\tStart\tEnd\tLength\n")
        for transcript_id, per_type in features.items():
            cursor = 1
            # Only the region types this transcript actually has, in fixed order
            present = [kind for kind in ordered_types if kind in per_type]
            for kind in present:
                ordered_segments = sorted(per_type[kind], key=lambda item: item['start'])
                # Inclusive coordinates: each segment contributes end - start + 1 bases
                region_length = sum(
                    item['end'] - item['start'] + 1 for item in ordered_segments
                )
                region_end = cursor + region_length - 1
                handle.write(
                    f"{transcript_id}\t{kind}\t{cursor}\t{region_end}\t{region_length}\n"
                )
                cursor = region_end + 1  # the following region starts right after

def main():
    """Command-line entry point: parse -i/--input, write the table to -o/--output."""
    cli = argparse.ArgumentParser(
        description="Reformat GFF3 file to transcript-centric simplified format with additional length info"
    )
    cli.add_argument("-i", "--input", required=True, help="Input GFF3 file path")
    cli.add_argument("-o", "--output", required=True, help="Output GFF3 file path")
    options = cli.parse_args()

    # Read the annotation first, then emit the per-transcript table
    parsed = parse_gff3(options.input)
    write_new_gff3(parsed, options.output)

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()