基于此系列文章的最终代码:
DNA Toolkit Part 1: Validating and counting nucleotides
[ 共六篇 ]
dna_toolkit.py
:处理 DNA / RNA / 密码子 数据的一系列函数的存放文件,包括 structures.py 。
structures.py
:存放 DNA,RNA,密码子,蛋白质等数据的文件。
main.py
:用于测试代码的文件,包括 dna_toolkit.py 。
dna_toolkit.py
from collections import Counter
from structures import *
def validate_seq(seq): # 检查序列,确保有效的DNA字符串
tmpseq = seq.upper()
for nuc in tmpseq:
if nuc not in DNA_Nucleotides:
return False
return tmpseq
def nucleotide_frequency(seq): # 计算每个核苷酸的数量
tmpFreqDict = {"A": 0, "C": 0, "G": 0, "T": 0}
for nuc in seq:
tmpFreqDict[nuc] += 1
return tmpFreqDict # 返回字典
def reverse_complement(seq): # 补,反
# return ''.join([DNA_ReverseComplement[nuc] for nuc in seq])[::-1]
mapping = str.maketrans('ATCG', 'TAGC')
return seq.translate(mapping)[::-1]
def transcription(seq): # DNA -> RNA 转录
return seq.replace("T", "U")
def gc_content(seq): # GC含量
return round((seq.count('C') + seq.count('G')) / len(seq) * 100)
def gc_content_subsec(seq, k=5): # GC含量在长度为k的子串
res = []
for i in range(0, len(seq) - k + 1, k):
subseq = seq[i:i + k]
res.append(gc_content(subseq))
return res
def translate_seq(seq, init_pos=0): # 转化为氨基酸链(多肽链)
return [
DNA_Codons[seq[pos:pos + 3]]
for pos in range(init_pos, len(seq) - 2, 3)
]
def codon_usage(seq, aminoacid): # 编码DNA序列中给定氨基酸的每个密码子的频率。
tmpList = []
for i in range(0, len(seq) - 2, 3):
if DNA_Codons[seq[i:i + 3]] == aminoacid:
tmpList.append(seq[i:i + 3])
freqDict = dict(Counter(tmpList))
totalWight = sum(freqDict.values())
for seq in freqDict:
freqDict[seq] = round(freqDict[seq] / totalWight, 2)
return freqDict
def gen_reading_frames(seq): # 生成DNA序列的六个氨基酸链(多肽链)
frames = []
frames.append(translate_seq(seq, 0))
frames.append(translate_seq(seq, 1))
frames.append(translate_seq(seq, 2))
frames.append(translate_seq(reverse_complement(seq), 0))
frames.append(translate_seq(reverse_complement(seq), 1))
frames.append(translate_seq(reverse_complement(seq), 2))
return frames
def proteins_from_rf(aa_seq): # 计算氨基酸链中所有可能的蛋白质,返回一个可能的蛋白质列表
current_prot = []
proteins = []
for aa in aa_seq:
if aa == "_":
# STOP accumulating amino acids if _ - STOP was found
if current_prot:
for p in current_prot:
proteins.append(p)
current_prot = []
else:
# START accumulating amino acids if M - START was found
if aa == "M":
current_prot.append("")
for i in range(len(current_prot)):
current_prot[i] += aa
return proteins
def all_proteins_from_orfs(seq, startReadPos=0, endReadPos=0, ordered=False): # 总结
if endReadPos > startReadPos:
rfs = gen_reading_frames(seq[startReadPos: endReadPos]) # 部分
else:
rfs = gen_reading_frames(seq) # 全部
res = []
for rf in rfs:
prots = proteins_from_rf(rf)
for p in prots:
res.append(p)
if ordered: # 是否进行排序
return sorted(res, key=len, reverse=True)
return res
structures.py
DNA_Nucleotides = ['A', 'C', 'G', 'T'] # DNA_核苷酸
DNA_ReverseComplement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}
DNA_Codons = {
# 'M' - START, '_' - STOP
"GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
"TGT": "C", "TGC": "C",
"GAT": "D", "GAC": "D",
"GAA": "E", "GAG": "E",
"TTT": "F", "TTC": "F",
"GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
"CAT": "H", "CAC": "H",
"ATA": "I", "ATT": "I", "ATC": "I",
"AAA": "K", "AAG": "K",
"TTA": "L", "TTG": "L", "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
"ATG": "M",
"AAT": "N", "AAC": "N",
"CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
"CAA": "Q", "CAG": "Q",
"CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R", "AGA": "R", "AGG": "R",
"TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S", "AGT": "S", "AGC": "S",
"ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
"GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
"TGG": "W",
"TAT": "Y", "TAC": "Y",
"TAA": "_", "TAG": "_", "TGA": "_"
}
# NCBI database; FASTA formatted
# NM_001185098.1 Homo sapiens insulin (INS), transcript variant 3, mRNA
NM_001185098_1 = '\
AGCCCTCCAGGACAGGCTGCATCAGAAGAGGCCATCAAGCAGGTCTGTTCCAAGGGCCTTTGCGTCAGGT\
GGGCTCAGGATTCCAGGGTGGCTGGACCCCAGGCCCCAGCTCTGCAGCAGGGAGGACGTGGCTGGGCTCG\
TGAAGCATGTGGGGGTGAGCCCAGGGGCCCCAAGGCAGGGCACCTGGCCTTCAGCCTGCCTCAGCCCTGC\
CTGTCTCCCAGATCACTGTCCTTCTGCCATGGCCCTGTGGATGCGCCTCCTGCCCCTGCTGGCGCTGCTG\
GCCCTCTGGGGACCTGACCCAGCCGCAGCCTTTGTGAACCAACACCTGTGCGGCTCACACCTGGTGGAAG\
CTCTCTACCTAGTGTGCGGGGAACGAGGCTTCTTCTACACACCCAAGACCCGCCGGGAGGCAGAGGACCT\
GCAGGTGGGGCAGGTGGAGCTGGGCGGGGGCCCTGGTGCAGGCAGCCTGCAGCCCTTGGCCCTGGAGGGG\
TCCCTGCAGAAGCGTGGCATTGTGGAACAATGCTGTACCAGCATCTGCTCCCTCTACCAGCTGGAGAACT\
ACTGCAACTAGACGCAGCCCGCAGGCAGCCCCACACCCGCCGCCTCCTGCACCGAGAGAGATGGAATAAA\
GCCCTTGAACCAGCAAAA'
main.py
from dna_toolkit import *
import random
# 创建一个随机DNA序列
randDNAStr = ''.join([random.choice(DNA_Nucleotides) for nuc in range(50)])
DNAStr = validate_seq(randDNAStr)
print(f'\nSequence: {DNAStr}\n')
print(f'[1] + Sequence Length: {len(DNAStr)}\n')
print(f'[2] + Nucleotide Frequency: {nucleotide_frequency(DNAStr)}\n')
print(f'[3] + DNA/RNA Transcription: {transcription(DNAStr)}\n')
print(f"[4] + DNA String + Complement + Reverse Complement:\n5' {DNAStr} 3'")
print(f" {''.join(['|' for c in range(len(DNAStr))])}")
print(f"3' {reverse_complement(DNAStr)[::-1]} 5' [Complement]")
print(f"5' {reverse_complement(DNAStr)} 3' [Rev. Complement]\n")
print(f'[5] + GC Content: {gc_content(DNAStr)}%\n')
print(f'[6] + GC Content in Subsection k=5: {gc_content_subsec(DNAStr, k=5)}\n')
print(f'[7] + Aminoacids Sequence from DNA: {translate_seq(DNAStr, 0)}\n')
print(f'[8] + Codon frequency (L): {codon_usage(DNAStr, "L")}\n')
print('[9] + Reading_frames:')
for frames in gen_reading_frames(DNAStr):
print(frames)
# print('\n[10] + All prots in 6 open reading frames:')
# for prot in all_proteins_from_orfs(DNAStr, 0, 0, True):
# print(f'{prot}')
print('\n[10] + All prots in 6 open reading frames:')
for prot in all_proteins_from_orfs(NM_001185098_1, 0, 0, True):
print(f'{prot}')
执行结果:
Sequence: CCGTCCTCTCGCTCCCTGACATTTCTACGGTGGCGCTATGAGAGTTAGTC
[1] + Sequence Length: 50
[2] + Nucleotide Frequency: {'A': 7, 'C': 16, 'G': 12, 'T': 15}
[3] + DNA/RNA Transcription: CCGUCCUCUCGCUCCCUGACAUUUCUACGGUGGCGCUAUGAGAGUUAGUC
[4] + DNA String + Complement + Reverse Complement:
5' CCGTCCTCTCGCTCCCTGACATTTCTACGGTGGCGCTATGAGAGTTAGTC 3'
||||||||||||||||||||||||||||||||||||||||||||||||||
3' GGCAGGAGAGCGAGGGACTGTAAAGATGCCACCGCGATACTCTCAATCAG 5' [Complement]
5' GACTAACTCTCATAGCGCCACCGTAGAAATGTCAGGGAGCGAGAGGACGG 3' [Rev. Complement]
[5] + GC Content: 56%
[6] + GC Content in Subsection k=5: [80, 60, 80, 60, 20, 60, 80, 40, 40, 40]
[7] + Aminoacids Sequence from DNA: ['P', 'S', 'S', 'R', 'S', 'L', 'T', 'F', 'L', 'R', 'W', 'R', 'Y', 'E', 'S', '_']
[8] + Codon frequency (L): {'CTG': 0.5, 'CTA': 0.5}
[9] + Reading_frames:
['P', 'S', 'S', 'R', 'S', 'L', 'T', 'F', 'L', 'R', 'W', 'R', 'Y', 'E', 'S', '_']
['R', 'P', 'L', 'A', 'P', '_', 'H', 'F', 'Y', 'G', 'G', 'A', 'M', 'R', 'V', 'S']
['V', 'L', 'S', 'L', 'P', 'D', 'I', 'S', 'T', 'V', 'A', 'L', '_', 'E', 'L', 'V']
['D', '_', 'L', 'S', '_', 'R', 'H', 'R', 'R', 'N', 'V', 'R', 'E', 'R', 'E', 'D']
['T', 'N', 'S', 'H', 'S', 'A', 'T', 'V', 'E', 'M', 'S', 'G', 'S', 'E', 'R', 'T']
['L', 'T', 'L', 'I', 'A', 'P', 'P', '_', 'K', 'C', 'Q', 'G', 'A', 'R', 'G', 'R']
[10] + All prots in 6 open reading frames:
MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN
MRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN
MLVQHCSTMPRFCRDPSRAKGCRLPAPGPPPSSTCPTCRSSASRRVLGV
MPRFCRDPSRAKGCRLPAPGPPPSSTCPTCRSSASRRVLGV
MLHEPSHVLPAAELGPGVQPPWNPEPT
MAEGQ
MWG
ME
Process finished with exit code 0