Biopython---Sequence Handling

https://nbviewer.org/github/cgoliver/Notebooks/blob/master/COMP_364/L25/L25.ipynb
https://www.youtube.com/watch?v=qQ7rIpB4oOw&list=PLfclbddrqrdm

BioPython

  • Sequence Handling
  • 3D Structure
  • Population Genetics

BioPython.Seq

#let's make a generic sequence

from Bio.Seq import Seq

my_seq = Seq("CCCGGAGAGA")
print(type(my_seq))
#let's see what attributes this object has
attributes = [a for a in dir(my_seq) if not a.startswith("_")]
print(attributes)

my_gene = Seq("ACTAGCAGCGGA")
#get the mRNA
my_transcript = my_gene.transcribe()
print(my_transcript)

#get the protein from the mRNA
my_protein = my_transcript.translate()
print(my_protein)

#You can also go straight from the DNA to the protein.
coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
myprot = coding_dna.translate()
print(myprot)
#As you can see, we got some STOP codons represented as * and translation continued.

#We can get translation to actually stop when it encounters a STOP codon.
myprot = coding_dna.translate(to_stop=True)
print(myprot)

#We can be even more realistic and only allow translation of valid genes (i.e. with a valid start and stop codon and proper number of bases).
#This is done by setting the cds=True keyword argument, which stands for "coding sequence".If we don't have a valid coding sequence, we get an exception.
myprot = coding_dna.translate(cds=True)
gene = Seq("ATGGCCATTGTAATGTAG")
gene.translate(cds=True)

General sequence methods

#There are some convenient operations for dealing with sequences.We can concatenate sequences if they are of matching type.
seq1 = Seq("AAACGGA")
seq2 = Seq("GGAGAT")
seq1 + seq2

#We can also index and slice as though we had strings.
seq1[:2] + seq2[-1]

#There is another type of object called a MutableSeq. If we want to support mutability we can convert a Seq object to a MutableSeq object quite easily.
mut_seq = seq1.tomutable()
mut_seq
mut_seq[0] = "G"
print(mut_seq)

#We can also do searching inside sequences.
myseq = Seq("CCAGAAACCCGGAA")

#find the first occurence of the pattern
print(myseq.find("GAA"))

#find the number of non-overlapping occurrences of a pattern
print(myseq.count("GAA"))

Database Searching

#The qblast method from the Bio.Blast.NCBIWWW module essentially sends our sequence to the BLAST web server.

#Here we are using the "nucleotice BLAST" algorithm so we say blastn and we are using it on the database of all nucleotide sequences, called nt.

from Bio.Blast import NCBIWWW
result_handle = NCBIWWW.qblast("blastn", "nt", Seq("AAAAGGAGAGAGAGTTTATA"))

from Bio.Blast import NCBIXML
blast_records = NCBIXML.parse(result_handle)
#We get an "iterator" of BLAST record objects, or "search results". We can now loop over each of our search results and print some information.



#The alignment attribute itself has other attributes like the query sequence, the length and title of the matching organism..

for b in blast_records:
    for alignment in b.alignments:
        for hsp in alignment.hsps:
            print('****Alignment****')
            print('sequence:', alignment.title)
            print('length:', alignment.length)
            print('e value:', hsp.expect)
            print(hsp.query[0:75] + '...')
            print(hsp.match[0:75] + '...')
            print(hsp.sbjct[0:75] + '...')


SeqRecord and SeqIO

from Bio import SeqIO

records = SeqIO.parse("plague.fa", "fasta")
for r in records:
    print(type(r))
    print([a for a in dir(r) if not a.startswith("_")])

record = SeqIO.read(“single_plague.fa”, “fasta”)
#The SeqRecords object has some useful attributes.
ID: NZ_ADRZ01000932.1
Name: NZ_ADRZ01000932.1
Description: NZ_ADRZ01000932.1 Yersinia pestis biovar Antiqua str. E1979001 Contig_E1979001_19275, whole genome shotgun sequence
Number of features: 0
Seq(‘CTCTCCCAGCTGAGCTATAGCCCCAATGCGCACATAATAAATCGTGTGAACGGG…AGC’, SingleLetterAlphabet())

print(record.seq)
CTCTCCCAGCTGAGCTATAGCCCCAATGCGCACATAATAAATCGTGTGAACGGGGCGCATGATATGAGACCCCCGAAACTGTGTCAACGGCTAAATCGATTTCTCGTGTTAAGCGCTGAAAAAGCGGCCAAATCAGCCTGCAAATAACATAATAAGTGGAATGATGTTCACAAATTTGTTGTCACACCGCTGCTGTTATCAAATATAATAAATATCCTCCGGCATAGC
print(record.id)
NZ_ADRZ01000932.1
print(record.description)
NZ_ADRZ01000932.1 Yersinia pestis biovar Antiqua str. E1979001 Contig_E1979001_19275, whole genome shotgun sequence

We can also create our own SeqRecord objects.

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
                       IUPAC.protein),
                   id="YP_025292.1", name="HokC",
                   description="toxic membrane protein, small")
print(record)
record.format("fasta")
with open("myfasta.fa", "w") as f:
    f.write(record.format("fasta"))

#https://biopython.org/wiki/Alphabet

Making Multiple Sequence Alignments and Phylogenetic Trees

Step 1: Obtain some sequences and align them

import random

#generate some random sequences based on a random seed sequence
mut = 0.1
BASES = {"A", "C", "G", "T"}
seqs = []
seed = "".join([random.choice(list(BASES)) for _ in range(25 + random.choice([-1, 0, 1]))])

seq_records = []

for i in range(20):
    new_seq = "".join([b if random.random() > mut else random.choice(list(BASES - {b}))\
        for b in seed])
    #create a seq record
    seq_records.append(SeqRecord(Seq(new_seq), id=str(i)))
    
print(seq_records)
#write our sequences to fasta format
with open("seqs.fasta", "w") as f:
    SeqIO.write(seq_records, f, "fasta")

Step 2: Align the sequences
let’s just use an online tool which takes a input a fasta file which we just created.(https://www.ebi.ac.uk/Tools/msa/muscle/)

Step 3: Load the resulting alignment

#open the alignmnent file
from Bio import AlignIO
with open("aln.clustal", "r") as aln:
    #use AlignIO to read the alignment file in 'clustal' format
    alignment = AlignIO.read(aln, "clustal")

Step 4: Use the alignment to obtain a “distance” between all pairs of sequences

from Bio.Phylo.TreeConstruction import DistanceCalculator

#calculate the distance matrix
calculator = DistanceCalculator('identity')
#adds distance matrix to the calculator object and returns it
dm = calculator.get_distance(alignment)
print(dm)

Step 5: Construct a tree from the set of distances

from Bio.Phylo.TreeConstruction import DistanceTreeConstructor

#initialize a DistanceTreeConstructor object based on our distance calculator object
constructor = DistanceTreeConstructor(calculator)

#build the tree
upgma_tree = constructor.build_tree(alignment)

from Bio import Phylo
import pylab
#draw the tree
Phylo.draw(upgma_tree)
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值