#parse or read sequence
Bio.SeqIO.parse("path_to_file","file_formate")#通常用于解析记录多个SeqRecord的文件,并返回一个可迭代对象
Bio.SeqIO.read("path_to_file","file_formate")#通常用于解析只记录单个SeqRecord的文件# or pass he handle to parsewithopen("ls_orchid.gbk")as handle:
SeqIO.parse(handle,"gb")
读取genbank文件
record_iterator = SeqIO.parse("ls_orchid.gbk","genbank")# one for item in record_iterator:
pirnt(item)# two
first_record =next(record_iterator)
second_record =next(record_iterator)# three
record_list =[record for record in record_iterator]# or
record_list =list(SeqIO.parse("ls_orchid.gbk","genbank"))#得到SeqRecord形式的数据后,就可以按照上一届的学习进行数据的获取和操作了
seqrecord.id
seqrecord.seq
seqrecord.description
seqrecord.name
seqrecord.annotations
seqrecord.annotations.keys()
seqrecord.annotations.values()
seqrecord.annotations["source"]
parse sequence from compressed files
#parse sequence from compressed filesimport gzip
from Bio import SeqIO
with gzip.open("ls_orchid.gbk.gz","rt")as handle:
SeqIO.parse(handle,"gb")import bz2
from Bio import SeqIO
with bz2.open("ls_orchid.gbk.bz2","rt")as handle:
SeqIO.parse(handle,"gb")
parse sequence from the net
#parse sequence from the netfrom Bio import Entrez
from Bio import SeqIO
Entrez.email ="A.N.Other@example.com"with Entrez.efetch(db="nucleotide", rettype="fasta", retmode="text",id="6273291")as handle:
seq_record = SeqIO.read(handle,"fasta")print("%s with %i features"%(seq_record.id,len(seq_record.features)))
AF191665.1 with 0 features
from Bio import Entrez
from Bio import SeqIO
Entrez.email ="A.N.Other@example.com"with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text",id="6273291")as handle:
seq_record = SeqIO.read(handle,"gb")# using "gb" as an alias for "genbank"print("%s with %i features"%(seq_record.id,len(seq_record.features)))
AF191665.1 with 3 features
from Bio import Entrez
from Bio import SeqIO
Entrez.email ="A.N.Other@example.com"with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text",id="6273291,6273290,6273289")as handle:for seq_record in SeqIO.parse(handle,"gb"):print("%s %s..."%(seq_record.id, seq_record.description[:50]))print("Sequence length %i, %i features, from: %s"%(len(seq_record),len(seq_record.features),seq_record.annotations["source"],))
#parse SwissProt sequences from the netfrom Bio import ExPASy
from Bio import SeqIO
with ExPASy.get_sprot_raw("O23729")as handle:
seq_record = SeqIO.read(handle,"swiss")print(seq_record.id)print(seq_record.name)print(seq_record.description)print(repr(seq_record.seq))print("Length %i"%len(seq_record))print(seq_record.annotations["keywords"])
parse sequence files as dictionaries
#sequence files as dictionaries#Sequence files as Dictionaries { In memory#This is very useful formoderately large files where you only need to access certain elements of the file, and makes for a nice quick’n dirty#database. #You can use the function Bio.SeqIO.to_dict() to make a SeqRecord dictionary (in memory).#By defaultthis will use each record’s identifier (i.e. the .id attribute) as the key.
orchid_dict = SeqIO.to_dict(SeqIO.parse("ls_orchid.gbk","genbank"))list(orchid_dict.keys())#Under Python 3 the dictionary methods like \.keys()\ and \.values()\ are iterators rather than lists
seq_record = orchid_dict["Z78475.1"]
seq_record.description
seq_record.seq
#Specifying the dictionary keysdefget_accession(record):""""Given a SeqRecord, return the accession number as a string.
e.g. "gi|2765613|emb|Z78488.1|PTZ78488" -> "Z78488.1"
"""
parts = record.id.split("|")assertlen(parts)==5and parts[0]=="gi"and parts[2]=="emb"return parts[3]
orchid_dict = SeqIO.to_dict(SeqIO.parse("ls_orchid.fasta","fasta"), key_function=get_accession)#自定义每个seqrecord的keyprint(orchid_dict.keys())
#sequence files as dictionaries#Sequence files as Dictionaries { In memory#For larger files you should consider Bio.SeqIO.index(),#it still returns a dictionary like object, this does not keep everything in memory. Instead, it just records where#each record is within the file { when you ask for a particular record, it then parses it on demand.
orchid_dict = SeqIO.index("ls_orchid.gbk","genbank")
orchid_dict.keys()
seq_record = orchid_dict["Z78475.1"]
seq_record.description
seq_record.seq
orchid_dict.close()#Note that Bio.SeqIO.index() won’t take a handle, but only a filename.#Specifying the dictionary keysdefget_acc(identifier):""""Given a SeqRecord identifier string, return the accession number as a string.
e.g. "gi|2765613|emb|Z78488.1|PTZ78488" -> "Z78488.1"
"""
parts = identifier.split("|")assertlen(parts)==5and parts[0]=="gi"and parts[2]=="emb"return parts[3]
orchid_dict = SeqIO.index("ls_orchid.fasta","fasta", key_function=get_acc)print(orchid_dict.keys())
#Sequence files as Dictionaries { Database indexed files#Bio.SeqIO.index_db(), which can work on even extremely largefiles since it stores the record information as a file on disk #(using an SQLite3 database) rather than in memory.#先去下载一些大文件
$ curl -O ftp://ftp.ncbi.nih.gov/genbank/gbvrl1.seq.gz
$ curl -O ftp://ftp.ncbi.nih.gov/genbank/gbvrl2.seq.gz
$ curl -O ftp://ftp.ncbi.nih.gov/genbank/gbvrl3.seq.gz
$ curl -O ftp://ftp.ncbi.nih.gov/genbank/gbvrl4.seq.gz
$ gunzip gbvrl*.seq.gz
import glob
from Bio import SeqIO
files = glob.glob("gbvrl*.seq")
gb_vrl = SeqIO.index_db("gbvrl.idx", files,"genbank")#在硬盘上存在一个类似于数据库的gbvrl.idx文件,下次重复这个步骤时只是load index file gbvrl#print("%i sequences indexed" % len(gb_vrl))print(gb_vrl["AB811634.1"].description)print(gb_vrl.get_raw("AB811634.1"))
write sequence files
#write sequence files#Bio.SeqIO.write() which is for sequence output
SeqIO.write(my_records,"my_example.faa","fasta")#some SeqRecord objects, a handle or filename to write to, and a sequence format
#Converting between sequence file formatsfrom Bio import SeqIO
records = SeqIO.parse("ls_orchid.gbk","genbank")
count = SeqIO.write(records,"my_example.fasta","fasta")# or
count = SeqIO.convert("ls_orchid.gbk","genbank","my_example.fasta","fasta")#无法做fastq to fasta
Low level FASTA and FASTQ parsers
#Low level FASTA and FASTQ parsersfrom Bio.SeqIO.FastaIO import SimpleFastaParser
count =0
total_len =0withopen("ls_orchid.fasta")as in_handle:for title, seq in SimpleFastaParser(in_handle):
count +=1
total_len +=len(seq)print("%i records with total sequence length %i"%(count, total_len))
out_handle.write(">%s\n%s\n"%(title, seq))#解析fastq文件的轻量级脚本from Bio.SeqIO.QualityIO import FastqGeneralIterator
count =0
total_len =0withopen("example.fastq")as in_handle:for title, seq, qual in FastqGeneralIterator(in_handle):
count +=1
total_len +=len(seq)print("%i records with total sequence length %i"%(count, total_len))
out_handle.write("@%s\n%s\n+\n%s\n"%(title, seq, qual))