alignment object and alignment tools

39 篇文章 1 订阅
11 篇文章 0 订阅

multiple sequence alignment object

#multiple sequence alignment object
#We have two functions for reading in sequence alignments, Bio.AlignIO.read() and Bio.AlignIO.parse()
#Using Bio.AlignIO.parse() will return an iterator which gives MultipleSeqAlignment objects.
#the Bio.AlignIO.read() function which returns a single MultipleSeqAlignment object.

from Bio import AlignIO

alignment = AlignIO.read("PF05371_seed.sth", "stockholm")  #handle of filename  &&  fileformat
print(alignment) #You’ll notice in the above output the sequences have been truncated
print("Alignment length %i" % alignment.get_alignment_length()) #Alignment length

for record in alignment:
    print("%s - %s" % (record.seq, record.id)) #每一个都是SeqRecord    
#In general however, files can contain more than one alignment, and to read these files we must use the Bio.AlignIO.parse() function
from Bio import AlignIO

alignments = AlignIO.parse("resampled.phy", "phylip")
for alignment in alignments:
    print(alignment)
    print()
    
# keep all the alignments in memory at once, which will allow you to access them in any order, then turn the iterator into a list
from Bio import AlignIO

alignments = list(AlignIO.parse("resampled.phy", "phylip"))
last_align = alignments[-1]
first_align = alignments[0]

# seq_count 指定每多少个seqRecord认为是一个alignment
AlignIO.parse(handle, "fasta", seq_count=2)

Writing Alignments

#Writing Alignments
# Bio.AlignIO.write() which is for alignment output (writing files). This is a function taking three arguments: some MultipleSeqAlignment 
# objects (or for backwards compatibility the obsolete Alignment objects), a handle or filename to write to, and a sequence format.
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from Bio import AlignIO

align1 = MultipleSeqAlignment([SeqRecord(Seq("ACTGCTAGCTAG"), id="Alpha"),SeqRecord(Seq("ACT-CTAGCTAG"), id="Beta"), SeqRecord(Seq("ACTGCTAGDTAG"), id="Gamma"),])
align2 = MultipleSeqAlignment([SeqRecord(Seq("GTCAGC-AG"), id="Delta"),SeqRecord(Seq("GACAGCTAG"), id="Epsilon"),SeqRecord(Seq("GTCAGCTAG"), id="Zeta"),])
align3 = MultipleSeqAlignment([SeqRecord(Seq("ACTAGTACAGCTG"), id="Eta"),SeqRecord(Seq("ACTAGTACAGCT-"), id="Theta"),SeqRecord(Seq("-CTACTACAGGTG"), id="Iota"),])
my_alignments = [align1, align2, align3]

AlignIO.write(my_alignments, "my_example.phy", "phylip")
#Converting between sequence alignment file formats
#use the Bio.AlignIO.convert() helper function.
from Bio import AlignIO

count = AlignIO.convert("PF05371_seed.sth", "stockholm", "PF05371_seed.aln", "clustal")

# Or, using Bio.AlignIO.parse() and Bio.AlignIO.write():
alignments = AlignIO.parse("PF05371_seed.sth", "stockholm")
count = AlignIO.write(alignments, "PF05371_seed.aln", "clustal")

manipulating alignment result

#manipulating alignment
#First of all, in some senses the alignment objects act like a Python list of SeqRecord objects (the rows). ********
from Bio import AlignIO

alignment = AlignIO.read("PF05371_seed.sth", "stockholm")
print("Number of rows: %i" % len(alignment)) # Number of rows

for record in alignment:
    print("%s - %s" % (record.seq, record.id))
    
#also use the list-like append and extend methods to add more rows to the alignment (as SeqRecord objects)
# slicing
print(alignment)
print(alignment[3:7]) #Alignment with 4 rows
print(alignment[2, 6])
print(alignment[:, 6]) #You can pull out a single column as a string like this
print(alignment[3:6, :6]) 
print(alignment[:, 9:])

#concate
alignment[:, :6] + alignment[:, 9:]
# Alignments as arrays
import numpy as np
from Bio import AlignIO

alignment = AlignIO.read("PF05371_seed.sth", "stockholm")
align_array = np.array([list(rec) for rec in alignment], np.character)

print("Array shape %i by %i" % align_array.shape)

Alignment Tools – ClusterW :multiple sequence alignment tool

# Alignment Tools -- ClusterW :multiple sequence alignment tool 
import Bio.Align.Applications

dir(Bio.Align.Applications) # doctest:+ELLIPSIS
['ClustalOmegaCommandline', 'ClustalwCommandline', 'DialignCommandline', 'MSAProbsCommandline',]

from Bio.Align.Applications import ClustalwCommandline

# help(ClustalwCommandline)
#By default ClustalW generate an alignment and guide tree file with names based on the input FASTA file, opuntia.aln and opuntia.dnd
cline = ClustalwCommandline("clustalw2", infile="opuntia.fasta") #clustalw2 indicating we have version two installed 

from Bio import AlignIO

align = AlignIO.read("opuntia.aln", "clustal")
print(align)

#the opuntia.dnd file ClustalW creates is just a standard Newick tree file, and Bio.Phylo can parse
from Bio import Phylo

tree = Phylo.read("opuntia.dnd", "newick")
Phylo.draw_ascii(tree)

Alignment Tools – MUSCLE :multiple sequence alignment tool

# Alignment Tools -- MUSCLE :multiple sequence alignment tool 
from Bio.Align.Applications import MuscleCommandline

#help(MuscleCommandline)
cline = MuscleCommandline(input="opuntia.fasta", out="opuntia.txt")
print(cline)

#By default MUSCLE will output the alignment as a FASTA file (using gapped sequences)
MuscleCommandline(input="opuntia.fasta", out="opuntia.aln", clw=True) #You can also ask for ClustalW-like output
MuscleCommandline(input="opuntia.fasta", out="opuntia.aln", clwstrict=True) # strict ClustalW output where the original ClustalW header line is used for maximum compatibility
# or using HTML which would give a human readable web page
MuscleCommandline(input="opuntia.fasta", out="opuntia.aln", html=True)
muscle -in opuntia.fasta -out opuntia.txt
# MUSCLE using stdout
from Bio.Align.Applications import MuscleCommandline

muscle_cline = MuscleCommandline(input="opuntia.fasta")
stdout, stderr = muscle_cline()  #都会缓存在内存中哦

from io import StringIO
from Bio import AlignIO

align = AlignIO.read(StringIO(stdout), "fasta")
print(align)

Pairwise sequence alignment

# Pairwise sequence alignment
#Pairwise sequence alignment is the process of aligning two sequences to each other by optimizing the similarity score between them
#Biopython includes two built-in pairwise aligners: the ’old’ Bio.pairwise2 module and the new PairwiseAligner class within the Bio.Align module
from Bio import pairwise2
from Bio import SeqIO

seq1 = SeqIO.read("alpha.faa", "fasta")
seq2 = SeqIO.read("beta.faa", "fasta")
alignments = pairwise2.align.globalxx(seq1.seq, seq2.seq)
#The tricky part are the last two letters of the function name (here: xx), which are used for decoding the scores and penalties for matches 
#(and mismatches) and gaps. The first letter decodes the match score, e.g. x means that a match counts 1 while mismatches have no costs.
#The second letter decodes the cost for gaps; x means no gap costs at all, with s different penalties for opening and extending a gap can be assigned.

len(alignments) # alignments now contains a list of alignments

# Each alignment is a named tuple consisting of the two aligned sequences, the score, the start and the end positions of the alignment 
# Bio.pairwise2 has a function format_alignment for a nicer printout:
print(pairwise2.format_alignment(*alignments[0])) # doctest:+ELLIPSIS

pairwise2 global alignments with punishment

## global alignments with punishment
from Bio import pairwise2
from Bio import SeqIO
from Bio.Align import substitution_matrices

blosum62 = substitution_matrices.load("BLOSUM62")
seq1 = SeqIO.read("alpha.faa", "fasta")
seq2 = SeqIO.read("beta.faa", "fasta")
alignments = pairwise2.align.globalds(seq1.seq, seq2.seq, blosum62, -10, -0.5)
len(alignments)
print(pairwise2.format_alignment(*alignments[0]))

# Local alignments are called similarly with the function align.localXX, where again XX stands for a two letter code for the match and gap functions:
from Bio import pairwise2
from Bio.Align import substitution_matrices

blosum62 = substitution_matrices.load("BLOSUM62")
alignments = pairwise2.align.localds("LSPADKTNVKAA", "PEEKSAV", blosum62, -10, -1)
print(pairwise2.format_alignment(*alignments[0]))
print(pairwise2.format_alignment(*alignments[0], full_sequences=True)) # non- aligned parts of the sequences will print

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值