1、Word frequency
计算ATCG在序列里的出现频率:
(n为个数,w为ATCG其中一个,L为序列长度)
例子:计算A的频率/含量
#导入必须的库
from Bio.Seq import Seq
genome = 'ATGACTACACGTATTTATTCATCAGTTACCACTTCAATTAGTGAGTTGAAAAAAAACCC'
Len = len(genome)#长度
print(Len)#59
print(type(Len))#
n_A = genome.count('A')
print(n_A)#22
print(type(n_A))#
f_A = float(n_A/Len)#不强制转换也是float
print(f_A)#0.3728813559322034
print(type(f_A))#
2、生成自己的fasta文件
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import generic_protein
rec1 = SeqRecord(
Seq(
"MMYQQGCFAGGTVLRLAKDLAENNRGARVLVVCSEITAVTFRGPSETHLDSMVGQALFGD"
"GAGAVIVGSDPDLSVERPLYELVWTGATLLPDSEGAIDGHLREVGLTFHLLKDVPGLISK"
"NIEKSLKEAFTPLGISDWNSTFWIAHPGGPAILDQVEAKLGLKEEKMRATREVLSEYGNM"
"SSAC",
generic_protein,
),
id="gi|14150838|gb|AAK54648.1|AF376133_1",
description="chalcone synthase [Cucumis sativus]",
)
rec2 = SeqRecord(
Seq(
"YPDYYFRITNREHKAELKEKFQRMCDKSMIKKRYMYLTEEILKENPSMCEYMAPSLDARQ"
"DMVVVEIPKLGKEAAVKAIKEWGQ",
generic_protein,
),
id="gi|13919613|gb|AAK33142.1|",
description="chalcone synthase [Fragaria vesca subsp. bracteata]",
)
rec3 = SeqRecord(
Seq(
"MVTVEEFRRAQCAEGPATVMAIGTATPSNCVDQSTYPDYYFRITNSEHKVELKEKFKRMC"
"EKSMIKKRYMHLTEEILKENPNICAYMAPSLDARQDIVVVEVPKLGKEAAQKAIKEWGQP"
"KSKITHLVFCTTSGVDMPGCDYQLTKLLGLRPSVKRFMMYQQGCFAGGTVLRMAKDLAEN"
"NKGARVLVVCSEITAVTFRGPNDTHLDSLVGQALFGDGAAAVIIGSDPIPEVERPLFELV"
"SAAQTLLPDSEGAIDGHLREVGLTFHLLKDVPGLISKNIEKSLVEAFQPLGISDWNSLFW"
"IAHPGGPAILDQVELKLGLKQEKLKATRKVLSNYGNMSSACVLFILDEMRKASAKEGLGT"
"TGEGLEWGVLFGFGPGLTVETVVLHSVAT",
generic_protein,
),
id="gi|13925890|gb|AAK49457.1|",
description="chalcone synthase [Nicotiana tabacum]",
)
from Bio import SeqIO
my_records = [rec1, rec2, rec3]
SeqIO.write(my_records, "my_example.faa", "fasta")
这是一个一个这样子输入生成的,那我要生成很多个怎么办,这里我根据自己的项目,写了一个for循环。不懂可以私信。
上面的rec1,rec2,rec3是biopython的类,我们不好操作,但是my_records其实是一个列表,这就很好办了,我们利用append,就可以循环生成一个长列表了。下面的代码是运行不了的,只是一个思路
for i in range(0,100):#假如我要生成100个序列
#seq_str是你的序列,类型是str,自己在这里赋值,id是你自己定的,为了区分我加了str(i)作为区分,然后没有加注释
rec = SeqRecord(Seq(seq_str),id="sample" + str(i))
#下面一行就是扩展列表
my_records.append(rec)
#最后直接写入文件
SeqIO.write(my_records, "./sample/genome_sample.fa", "fasta")