#multiple sequence alignment object#We have two functions for reading in sequence alignments, Bio.AlignIO.read() and Bio.AlignIO.parse()#Using Bio.AlignIO.parse() will return an iterator which gives MultipleSeqAlignment objects.#the Bio.AlignIO.read() function which returns a single MultipleSeqAlignment object.from Bio import AlignIO
alignment = AlignIO.read("PF05371_seed.sth","stockholm")#handle of filename && fileformatprint(alignment)#You’ll notice in the above output the sequences have been truncatedprint("Alignment length %i"% alignment.get_alignment_length())#Alignment lengthfor record in alignment:print("%s - %s"%(record.seq, record.id))#每一个都是SeqRecord
#In general however, files can contain more than one alignment, and to read these files we must use the Bio.AlignIO.parse() functionfrom Bio import AlignIO
alignments = AlignIO.parse("resampled.phy","phylip")for alignment in alignments:print(alignment)print()# keep all the alignments in memory at once, which will allow you to access them in any order, then turn the iterator into a listfrom Bio import AlignIO
alignments =list(AlignIO.parse("resampled.phy","phylip"))
last_align = alignments[-1]
first_align = alignments[0]# seq_count 指定每多少个seqRecord认为是一个alignment
AlignIO.parse(handle,"fasta", seq_count=2)
Writing Alignments
#Writing Alignments# Bio.AlignIO.write() which is for alignment output (writing files). This is a function taking three arguments: some MultipleSeqAlignment # objects (or for backwards compatibility the obsolete Alignment objects), a handle or filename to write to, and a sequence format.from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from Bio import AlignIO
align1 = MultipleSeqAlignment([SeqRecord(Seq("ACTGCTAGCTAG"),id="Alpha"),SeqRecord(Seq("ACT-CTAGCTAG"),id="Beta"), SeqRecord(Seq("ACTGCTAGDTAG"),id="Gamma"),])
align2 = MultipleSeqAlignment([SeqRecord(Seq("GTCAGC-AG"),id="Delta"),SeqRecord(Seq("GACAGCTAG"),id="Epsilon"),SeqRecord(Seq("GTCAGCTAG"),id="Zeta"),])
align3 = MultipleSeqAlignment([SeqRecord(Seq("ACTAGTACAGCTG"),id="Eta"),SeqRecord(Seq("ACTAGTACAGCT-"),id="Theta"),SeqRecord(Seq("-CTACTACAGGTG"),id="Iota"),])
my_alignments =[align1, align2, align3]
AlignIO.write(my_alignments,"my_example.phy","phylip")
#Converting between sequence alignment file formats#use the Bio.AlignIO.convert() helper function.from Bio import AlignIO
count = AlignIO.convert("PF05371_seed.sth","stockholm","PF05371_seed.aln","clustal")# Or, using Bio.AlignIO.parse() and Bio.AlignIO.write():
alignments = AlignIO.parse("PF05371_seed.sth","stockholm")
count = AlignIO.write(alignments,"PF05371_seed.aln","clustal")
manipulating alignment result
#manipulating alignment#First of all, in some senses the alignment objects act like a Python list of SeqRecord objects (the rows). ********from Bio import AlignIO
alignment = AlignIO.read("PF05371_seed.sth","stockholm")print("Number of rows: %i"%len(alignment))# Number of rowsfor record in alignment:print("%s - %s"%(record.seq, record.id))#also use the list-like append and extend methods to add more rows to the alignment (as SeqRecord objects)# slicingprint(alignment)print(alignment[3:7])#Alignment with 4 rowsprint(alignment[2,6])print(alignment[:,6])#You can pull out a single column as a string like thisprint(alignment[3:6,:6])print(alignment[:,9:])#concate
alignment[:,:6]+ alignment[:,9:]
# Alignments as arraysimport numpy as np
from Bio import AlignIO
alignment = AlignIO.read("PF05371_seed.sth","stockholm")
align_array = np.array([list(rec)for rec in alignment], np.character)print("Array shape %i by %i"% align_array.shape)
# Alignment Tools -- ClusterW :multiple sequence alignment tool import Bio.Align.Applications
dir(Bio.Align.Applications)# doctest:+ELLIPSIS['ClustalOmegaCommandline','ClustalwCommandline','DialignCommandline','MSAProbsCommandline',]from Bio.Align.Applications import ClustalwCommandline
# help(ClustalwCommandline)#By default ClustalW generate an alignment and guide tree file with names based on the input FASTA file, opuntia.aln and opuntia.dnd
cline = ClustalwCommandline("clustalw2", infile="opuntia.fasta")#clustalw2 indicating we have version two installed from Bio import AlignIO
align = AlignIO.read("opuntia.aln","clustal")print(align)#the opuntia.dnd file ClustalW creates is just a standard Newick tree file, and Bio.Phylo can parsefrom Bio import Phylo
tree = Phylo.read("opuntia.dnd","newick")
Phylo.draw_ascii(tree)
# Alignment Tools -- MUSCLE :multiple sequence alignment tool from Bio.Align.Applications import MuscleCommandline
#help(MuscleCommandline)
cline = MuscleCommandline(input="opuntia.fasta", out="opuntia.txt")print(cline)#By default MUSCLE will output the alignment as a FASTA file (using gapped sequences)
MuscleCommandline(input="opuntia.fasta", out="opuntia.aln", clw=True)#You can also ask for ClustalW-like output
MuscleCommandline(input="opuntia.fasta", out="opuntia.aln", clwstrict=True)# strict ClustalW output where the original ClustalW header line is used for maximum compatibility# or using HTML which would give a human readable web page
MuscleCommandline(input="opuntia.fasta", out="opuntia.aln", html=True)
muscle -in opuntia.fasta -out opuntia.txt
# MUSCLE using stdoutfrom Bio.Align.Applications import MuscleCommandline
muscle_cline = MuscleCommandline(input="opuntia.fasta")
stdout, stderr = muscle_cline()#都会缓存在内存中哦from io import StringIO
from Bio import AlignIO
align = AlignIO.read(StringIO(stdout),"fasta")print(align)
Pairwise sequence alignment
# Pairwise sequence alignment#Pairwise sequence alignment is the process of aligning two sequences to each other by optimizing the similarity score between them#Biopython includes two built-in pairwise aligners: the ’old’ Bio.pairwise2 module and the new PairwiseAligner class within the Bio.Align modulefrom Bio import pairwise2
from Bio import SeqIO
seq1 = SeqIO.read("alpha.faa","fasta")
seq2 = SeqIO.read("beta.faa","fasta")
alignments = pairwise2.align.globalxx(seq1.seq, seq2.seq)#The tricky part are the last two letters of the function name (here: xx), which are used for decoding the scores and penalties for matches #(and mismatches) and gaps. The first letter decodes the match score, e.g. x means that a match counts 1 while mismatches have no costs.#The second letter decodes the cost for gaps; x means no gap costs at all, with s different penalties for opening and extending a gap can be assigned.len(alignments)# alignments now contains a list of alignments# Each alignment is a named tuple consisting of the two aligned sequences, the score, the start and the end positions of the alignment # Bio.pairwise2 has a function format_alignment for a nicer printout:print(pairwise2.format_alignment(*alignments[0]))# doctest:+ELLIPSIS
pairwise2 global alignments with punishment
## global alignments with punishmentfrom Bio import pairwise2
from Bio import SeqIO
from Bio.Align import substitution_matrices
blosum62 = substitution_matrices.load("BLOSUM62")
seq1 = SeqIO.read("alpha.faa","fasta")
seq2 = SeqIO.read("beta.faa","fasta")
alignments = pairwise2.align.globalds(seq1.seq, seq2.seq, blosum62,-10,-0.5)len(alignments)print(pairwise2.format_alignment(*alignments[0]))# Local alignments are called similarly with the function align.localXX, where again XX stands for a two letter code for the match and gap functions:from Bio import pairwise2
from Bio.Align import substitution_matrices
blosum62 = substitution_matrices.load("BLOSUM62")
alignments = pairwise2.align.localds("LSPADKTNVKAA","PEEKSAV", blosum62,-10,-1)print(pairwise2.format_alignment(*alignments[0]))print(pairwise2.format_alignment(*alignments[0], full_sequences=True))# non- aligned parts of the sequences will print