P53基因序列统计分析
下载P53基因序列,采用Python,设置扫描窗口,窗口长度size分别设为1、2、3,每次窗口移动的步长step=1,统计并输出单个字符、紧邻两个字符、紧邻三个字符各自的数目和所占百分比。
如:A 的个数和百分比,AA 的个数和百分比,AAA 的个数和百分比,保存在本地文件。
from collections import Counter

from Bio import SeqIO
import pandas as pd
# Parse the gene FASTA file (SeqIO.read is used, so the file is expected to
# hold a single record) and keep only its sequence for the statistics.
p53_record = SeqIO.read("p53_datasets/ncbi_dataset/data/gene.fna", "fasta")
fa_seq = p53_record.seq
def find_seq(num, seq=None):
    """Count every window of ``num`` adjacent characters, sliding by one.

    Args:
        num: Window length (number of adjacent characters); must be >= 1.
        seq: Sequence to scan. When omitted, the module-level ``fa_seq`` is
            used, so existing ``find_seq(num)`` calls keep working.

    Returns:
        A list with one dict per distinct window, in order of first
        occurrence. Each dict has the keys ``'seq'`` (the window as ``str``),
        ``'count'`` (number of occurrences) and ``'ratio'`` (count divided by
        the total number of windows, rounded to 3 decimals). The list is
        empty when the sequence is shorter than ``num``.

    Raises:
        ValueError: If ``num`` is smaller than 1.
    """
    if num < 1:
        raise ValueError(f"window size must be >= 1, got {num}")

    # Work on a plain string so the windows are ordinary hashable ``str``
    # keys regardless of the sequence type that was passed in.
    text = str(fa_seq if seq is None else seq)

    total = len(text) - num + 1
    if total <= 0:
        return []

    # Start at offset 0. The previous loop advanced the window before
    # counting, so the first window was skipped while the ratio still
    # divided by the full window count.
    counts = Counter(text[start:start + num] for start in range(total))

    return [
        {
            'seq': window,
            'count': count,
            'ratio': round(count / total, 3),  # keep 3 decimal places
        }
        for window, count in counts.items()
    ]
# Write the statistics for window lengths 1, 2 and 3 to test2.csv, one
# header-prefixed section per window length.
for i in range(1, 4):
    ret = find_seq(i)
    print(ret)
    pf = pd.DataFrame(ret)
    # Truncate the file on the first section and append the later ones, so
    # re-running the script does not stack a second copy of the results onto
    # the output of an earlier run.
    pf.to_csv(
        'test2.csv',
        mode="w" if i == 1 else "a",
        index=False,
        header=True,
        encoding="utf-8",
    )