Biopython -- SeqIO

39 篇文章 1 订阅
11 篇文章 0 订阅

from Bio import SeqIO
#parse or read sequence
Bio.SeqIO.parse("path_to_file","file_formate")  #通常用于解析记录多个SeqRecord的文件,并返回一个可迭代对象
Bio.SeqIO.read("path_to_file","file_formate")  #通常用于解析只记录单个SeqRecord的文件
# or pass he  handle to parse
with open("ls_orchid.gbk") as handle:
    SeqIO.parse(handle,"gb")

读取genbank文件

record_iterator = SeqIO.parse("ls_orchid.gbk","genbank")
# one 
for item in record_iterator:
    pirnt(item)
    
# two
first_record = next(record_iterator)
second_record = next(record_iterator)

# three
record_list = [record for record in record_iterator]
# or
record_list = list(SeqIO.parse("ls_orchid.gbk","genbank")) 

#得到SeqRecord形式的数据后,就可以按照上一届的学习进行数据的获取和操作了
seqrecord.id
seqrecord.seq
seqrecord.description
seqrecord.name
seqrecord.annotations
seqrecord.annotations.keys()
seqrecord.annotations.values()
seqrecord.annotations["source"]

parse sequence from compressed files

#parse sequence from compressed files
import gzip
from Bio import SeqIO
with gzip.open("ls_orchid.gbk.gz","rt") as handle:
    SeqIO.parse(handle,"gb")

import bz2
from Bio import SeqIO
with bz2.open("ls_orchid.gbk.bz2","rt") as handle:
    SeqIO.parse(handle,"gb")

parse sequence from the net

#parse sequence from the net
from Bio import Entrez
from Bio import SeqIO
Entrez.email = "A.N.Other@example.com"
with Entrez.efetch(db="nucleotide", rettype="fasta", retmode="text", id="6273291") as handle:
    seq_record = SeqIO.read(handle, "fasta")
print("%s with %i features" % (seq_record.id, len(seq_record.features)))
AF191665.1 with 0 features
from Bio import Entrez
from Bio import SeqIO
Entrez.email = "A.N.Other@example.com"
with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id="6273291") as handle:
    seq_record = SeqIO.read(handle, "gb") # using "gb" as an alias for "genbank"
print("%s with %i features" % (seq_record.id, len(seq_record.features)))
AF191665.1 with 3 features
from Bio import Entrez
from Bio import SeqIO
Entrez.email = "A.N.Other@example.com"
with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id="6273291,6273290,6273289") as handle:
    for seq_record in SeqIO.parse(handle, "gb"):
        print("%s %s..." % (seq_record.id, seq_record.description[:50]))
        print("Sequence length %i, %i features, from: %s"% (len(seq_record),len(seq_record.features),seq_record.annotations["source"],))
AF191665.1 Opuntia marenae rpl16 gene; chloroplast gene for c...
Sequence length 902, 3 features, from: chloroplast Grusonia marenae
AF191664.1 Opuntia clavata rpl16 gene; chloroplast gene for c...
Sequence length 899, 3 features, from: chloroplast Grusonia clavata
AF191663.1 Opuntia bradtiana rpl16 gene; chloroplast gene for...
Sequence length 899, 3 features, from: chloroplast Grusonia bradtiana

parse SwissProt sequences from the net

#parse SwissProt sequences from the net
from Bio import ExPASy
from Bio import SeqIO
with ExPASy.get_sprot_raw("O23729") as handle:
    seq_record = SeqIO.read(handle, "swiss")
    print(seq_record.id)
    print(seq_record.name)
    print(seq_record.description)
    print(repr(seq_record.seq))
    print("Length %i" % len(seq_record))
    print(seq_record.annotations["keywords"])

parse sequence files as dictionaries

#sequence files as dictionaries
#Sequence files as Dictionaries { In memory
#This is very useful formoderately large files where you only need to access certain elements of the file, and makes for a nice quick’n dirty
#database. 
#You can use the function Bio.SeqIO.to_dict() to make a SeqRecord dictionary (in memory).
#By defaultthis will use each record’s identifier (i.e. the .id attribute) as the key.
orchid_dict = SeqIO.to_dict(SeqIO.parse("ls_orchid.gbk", "genbank"))
list(orchid_dict.keys()) #Under Python 3 the dictionary methods like \.keys()\ and \.values()\ are iterators rather than lists
seq_record = orchid_dict["Z78475.1"]
seq_record.description
seq_record.seq
#Specifying the dictionary keys
def get_accession(record):
""""Given a SeqRecord, return the accession number as a string.
e.g. "gi|2765613|emb|Z78488.1|PTZ78488" -> "Z78488.1"
"""
    parts = record.id.split("|")
    assert len(parts) == 5 and parts[0] == "gi" and parts[2] == "emb"
    return parts[3]

orchid_dict = SeqIO.to_dict(SeqIO.parse("ls_orchid.fasta", "fasta"), key_function=get_accession) #自定义每个seqrecord的key
print(orchid_dict.keys())
#sequence files as dictionaries
#Sequence files as Dictionaries { In memory
#For larger files you should consider Bio.SeqIO.index(),
#it still returns a dictionary like object, this does not keep everything in memory. Instead, it just records where
#each record is within the file { when you ask for a particular record, it then parses it on demand.
orchid_dict = SeqIO.index("ls_orchid.gbk", "genbank")
orchid_dict.keys()
seq_record = orchid_dict["Z78475.1"]
seq_record.description
seq_record.seq
orchid_dict.close()  #Note that Bio.SeqIO.index() won’t take a handle, but only a filename.

#Specifying the dictionary keys
def get_acc(identifier):
""""Given a SeqRecord identifier string, return the accession number as a string.
e.g. "gi|2765613|emb|Z78488.1|PTZ78488" -> "Z78488.1"
"""
    parts = identifier.split("|")
    assert len(parts) == 5 and parts[0] == "gi" and parts[2] == "emb"
    return parts[3]

orchid_dict = SeqIO.index("ls_orchid.fasta", "fasta", key_function=get_acc)
print(orchid_dict.keys())
#Sequence files as Dictionaries { Database indexed files
#Bio.SeqIO.index_db(), which can work on even extremely largefiles since it stores the record information as a file on disk 
#(using an SQLite3 database) rather than in memory.

#先去下载一些大文件
$ curl -O ftp://ftp.ncbi.nih.gov/genbank/gbvrl1.seq.gz
$ curl -O ftp://ftp.ncbi.nih.gov/genbank/gbvrl2.seq.gz
$ curl -O ftp://ftp.ncbi.nih.gov/genbank/gbvrl3.seq.gz
$ curl -O ftp://ftp.ncbi.nih.gov/genbank/gbvrl4.seq.gz
$ gunzip gbvrl*.seq.gz

import glob
from Bio import SeqIO
files = glob.glob("gbvrl*.seq")
gb_vrl = SeqIO.index_db("gbvrl.idx", files, "genbank") #在硬盘上存在一个类似于数据库的gbvrl.idx文件,下次重复这个步骤时只是load index file gbvrl
#print("%i sequences indexed" % len(gb_vrl))
print(gb_vrl["AB811634.1"].description)
print(gb_vrl.get_raw("AB811634.1"))

write sequence files

#write sequence files
#Bio.SeqIO.write() which is for sequence output 
SeqIO.write(my_records, "my_example.faa", "fasta") #some SeqRecord objects, a handle or filename to write to, and a sequence format
#Converting between sequence file formats
from Bio import SeqIO
records = SeqIO.parse("ls_orchid.gbk", "genbank")
count = SeqIO.write(records, "my_example.fasta", "fasta")

# or
count = SeqIO.convert("ls_orchid.gbk", "genbank", "my_example.fasta", "fasta")  #无法做fastq to fasta

Low level FASTA and FASTQ parsers

#Low level FASTA and FASTQ parsers
from Bio.SeqIO.FastaIO import SimpleFastaParser

count = 0
total_len = 0
with open("ls_orchid.fasta") as in_handle:
    for title, seq in SimpleFastaParser(in_handle):
        count += 1
        total_len += len(seq)

print("%i records with total sequence length %i" % (count, total_len))
out_handle.write(">%s\n%s\n" % (title, seq))

#解析fastq文件的轻量级脚本
from Bio.SeqIO.QualityIO import FastqGeneralIterator

count = 0
total_len = 0
with open("example.fastq") as in_handle:
    for title, seq, qual in FastqGeneralIterator(in_handle):
        count += 1
        total_len += len(seq)

print("%i records with total sequence length %i" % (count, total_len))
out_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值