# -*- coding: utf-8 -*-
"""
version: python 3.0
usage: python get_ASprofile_ref_hdrs.py path/species.genome.fa species
"""
import sys
import re
import fileinput
import pandas as pd
import os.path
def _scan_fasta(lines):
    """Tally total and non-N sequence length for every FASTA record.

    Args:
        lines: iterable of text lines from a FASTA file.

    Returns:
        dict mapping each record name (the header up to the first
        whitespace, leading ``>`` included) to ``[length, non_n_length]``.
        Records that share a name are accumulated into one entry, and
        insertion order follows first appearance in the input.

    Raises:
        ValueError: if sequence data appears before the first header.
    """
    totals = {}
    current = None
    for raw in lines:
        line = raw.strip()
        if not line:
            # Blank lines carry no sequence and must not trip the
            # "data before header" check below.
            continue
        if line.startswith(">"):
            current = line.split(None, 1)[0]
            totals.setdefault(current, [0, 0])
        elif current is None:
            raise ValueError("sequence data found before the first FASTA header")
        else:
            n_count = line.count("N") + line.count("n")
            totals[current][0] += len(line)
            # The field is labelled "nonNlen", so store the bases that are
            # NOT N rather than the N count itself.
            totals[current][1] += len(line) - n_count
    return totals


def _build_header_table(totals, species):
    """Turn the per-record tallies into the four-column header table.

    Each row is ``name, /len=<length>, /nonNlen=<non-N length>, /org=<species>``.
    """
    rows = [
        (name, "/len=" + str(length), "/nonNlen=" + str(non_n), "/org=" + species)
        for name, (length, non_n) in totals.items()
    ]
    return pd.DataFrame(rows, columns=["index", "length", "nonNlen", "species"])


def main(argv=None):
    """Write ``<species>.fa.hdrs.txt`` describing every record of a FASTA file.

    Args:
        argv: argument vector laid out like ``sys.argv`` (program name, FASTA
            path, species label). Defaults to ``sys.argv``.

    Exits with a usage message when arguments are missing, and with an
    explanatory message when the FASTA file has sequence before any header.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        sys.exit("usage: python get_ASprofile_ref_hdrs.py path/species.genome.fa species")
    fasta_path = argv[1]
    species = argv[2]

    try:
        # The context manager closes the input even if parsing fails.
        with fileinput.input(fasta_path) as handle:
            totals = _scan_fasta(handle)
    except ValueError as err:
        sys.exit("error: " + fasta_path + ": " + str(err))

    table = _build_header_table(totals, species)
    # quoting=3 is csv.QUOTE_NONE: fields are written verbatim, space-separated.
    table.to_csv(species + '.fa.hdrs.txt', sep=' ', header=False, index=False, quoting=3)


if __name__ == "__main__":
    main()
# hdrs processing script
# (web-page residue: "latest recommended article, published 2024-09-15 22:31:42")