# Counting low-complexity regions with Python (work in progress) - protein low-complexity region tools - CSDN blog
#
# Article link: https://blog.csdn.net/yin1331102028yin/article/details/135058384
import csv

def count_low_complexity_residues(sequence):
    """Return 4% of the sequence length, truncated to an integer.

    Args:
        sequence: The residue string (any sized object works; only its
            length is used).

    Returns:
        ``int(len(sequence) * 0.04)``.
    """
    return int(len(sequence) * 0.04)

def find_low_complexity_regions(sequence, max_residues):
    """Locate runs of lowercase residues and keep the sufficiently short ones.

    A region is a maximal run of consecutive characters for which
    ``str.islower()`` is true; any other character ends the current run.

    Args:
        sequence: Residue string in which lowercase characters mark
            low-complexity positions.
        max_residues: Largest run length (inclusive) that is still reported.

    Returns:
        A list of dicts, in order of appearance, each with the keys
        ``'start'`` and ``'end'`` (0-based, inclusive indices of the first
        and last lowercase residue of the run) and ``'count'`` (run length).
        Runs longer than ``max_residues`` are omitted.
    """
    regions = []
    current_region = None  # None while we are outside a lowercase run

    for i, residue in enumerate(sequence):
        if residue.islower():
            if current_region is None:
                # Anchor the start at the first lowercase residue of the run,
                # not at index 0 or at the preceding uppercase residue.
                current_region = {'start': i, 'end': i, 'count': 0}
            current_region['end'] = i
            current_region['count'] += 1
        elif current_region is not None:
            # A non-lowercase residue closes the run in progress.
            regions.append(current_region)
            current_region = None

    # The sequence may end while a run is still open.
    if current_region is not None:
        regions.append(current_region)

    return [region for region in regions if region['count'] <= max_residues]

def process_file(file_path, max_low_complexity_residues=3):
    """Parse a FASTA-like file and collect low-complexity regions per protein.

    The file is expected to contain header lines starting with ``'>'``, each
    followed by one or more sequence lines that are concatenated. Blank lines
    are ignored.

    For every protein, a residue budget is computed with
    ``count_low_complexity_residues`` (4% of the sequence length). Only
    proteins whose budget does not exceed ``max_low_complexity_residues`` are
    kept, and for those the regions found by ``find_low_complexity_regions``
    (limited to the budget) are recorded.

    Args:
        file_path: Path of the input file.
        max_low_complexity_residues: Upper bound (inclusive) on the residue
            budget for a protein to be reported. The default of 3 matches
            the original hard-coded value (79 * 0.04 = 3.16, truncated to 3).

    Returns:
        A list of dicts with the keys ``'name'`` (header text without the
        leading ``'>'``), ``'regions'`` (list of region dicts) and
        ``'count'`` (the residue budget, not the number of regions).

    Raises:
        ValueError: If sequence data appears before the first header line.
    """
    proteins = []
    current_protein = None

    # Stream the file line by line instead of loading it all at once.
    with open(file_path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                # Blank lines carry no data; skipping them also avoids
                # touching current_protein before any header was seen.
                continue

            if line.startswith('>'):
                if current_protein is not None:
                    proteins.append(current_protein)
                current_protein = {'name': line[1:], 'sequence': ''}
                print('current_protein:', current_protein)
            elif current_protein is None:
                raise ValueError(
                    f"sequence data found before the first '>' header in {file_path!r}"
                )
            else:
                current_protein['sequence'] += line

    if current_protein is not None:
        proteins.append(current_protein)

    # Keep only proteins whose 4% residue budget is small enough.
    valid_proteins = []
    for protein in proteins:
        low_complexity_residues = count_low_complexity_residues(protein['sequence'])
        if low_complexity_residues <= max_low_complexity_residues:
            valid_regions = find_low_complexity_regions(
                protein['sequence'], low_complexity_residues
            )
            valid_proteins.append({
                'name': protein['name'],
                'regions': valid_regions,
                'count': low_complexity_residues,
            })

    return valid_proteins

def write_to_csv(result, output_file):
    """Write one CSV row per region of every protein in ``result``.

    Args:
        result: Iterable of dicts, each providing ``'name'`` and
            ``'regions'``; every region is a dict with ``'start'``,
            ``'end'`` and ``'count'``.
        output_file: Destination path; the file is created or overwritten.

    The header row is always written. Start and end values are written
    incremented by one.
    """
    columns = ['Protein Name', 'Region Start', 'Region End', 'Region Count']

    def iter_rows():
        # Flatten the protein -> regions nesting into a stream of CSV rows.
        for entry in result:
            for span in entry['regions']:
                yield {
                    'Protein Name': entry['name'],
                    'Region Start': span['start'] + 1,
                    'Region End': span['end'] + 1,
                    'Region Count': span['count'],
                }

    # newline='' lets the csv module control line endings itself.
    with open(output_file, 'w', newline='') as handle:
        table = csv.DictWriter(handle, fieldnames=columns)
        table.writeheader()
        table.writerows(iter_rows())

if __name__ == "__main__":
    # Input and output locations; replace them with your own paths.
    file_path = './5.0-lcr.interval'
    output_file = "output02.csv"

    # Parse the input, then dump the per-protein regions straight to CSV.
    write_to_csv(process_file(file_path), output_file)

    # Success message (Chinese): "results successfully written to <file>".
    print(f"结果已成功写入到 {output_file}")