Biopython -- SeqIO

最新推荐文章于 2024-04-17 10:03:46 发布

All_Will_Be_Fine噻

最新推荐文章于 2024-04-17 10:03:46 发布

阅读量1k

点赞数

本文链接：https://blog.csdn.net/jiangshandaiyou/article/details/121629905

版权

bioinfo 同时被 3 个专栏收录

65 篇文章 0 订阅

订阅专栏

python

39 篇文章 1 订阅

订阅专栏

Biopython

11 篇文章 0 订阅

订阅专栏

文章目录

读取genbank文件
parse sequence from compressed files
parse sequence from the net
parse SwissProt sequences from the net
parse sequence files as dictionaries
write sequence files
Low level FASTA and FASTQ parsers

from Bio import SeqIO

#parse or read sequence
Bio.SeqIO.parse("path_to_file","file_formate")  #通常用于解析记录多个SeqRecord的文件，并返回一个可迭代对象
Bio.SeqIO.read("path_to_file","file_formate")  #通常用于解析只记录单个SeqRecord的文件
# or pass he  handle to parse
with open("ls_orchid.gbk") as handle:
    SeqIO.parse(handle,"gb")

读取genbank文件

record_iterator = SeqIO.parse("ls_orchid.gbk","genbank")
# one 
for item in record_iterator:
    pirnt(item)
    
# two
first_record = next(record_iterator)
second_record = next(record_iterator)

# three
record_list = [record for record in record_iterator]
# or
record_list = list(SeqIO.parse("ls_orchid.gbk","genbank")) 

#得到SeqRecord形式的数据后，就可以按照上一届的学习进行数据的获取和操作了
seqrecord.id
seqrecord.seq
seqrecord.description
seqrecord.name
seqrecord.annotations
seqrecord.annotations.keys()
seqrecord.annotations.values()
seqrecord.annotations["source"]

parse sequence from compressed files

#parse sequence from compressed files
import gzip
from Bio import SeqIO
with gzip.open("ls_orchid.gbk.gz","rt") as handle:
    SeqIO.parse(handle,"gb")

import bz2
from Bio import SeqIO
with bz2.open("ls_orchid.gbk.bz2","rt") as handle:
    SeqIO.parse(handle,"gb")

parse sequence from the net

#parse sequence from the net
from Bio import Entrez
from Bio import SeqIO
Entrez.email = "A.N.Other@example.com"
with Entrez.efetch(db="nucleotide", rettype="fasta", retmode="text", id="6273291") as handle:
    seq_record = SeqIO.read(handle, "fasta")
print("%s with %i features" % (seq_record.id, len(seq_record.features)))

AF191665.1 with 0 features

from Bio import Entrez
from Bio import SeqIO
Entrez.email = "A.N.Other@example.com"
with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id="6273291") as handle:
    seq_record = SeqIO.read(handle, "gb") # using "gb" as an alias for "genbank"
print("%s with %i features" % (seq_record.id, len(seq_record.features)))

AF191665.1 with 3 features

from Bio import Entrez
from Bio import SeqIO
Entrez.email = "A.N.Other@example.com"
with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id="6273291,6273290,6273289") as handle:
    for seq_record in SeqIO.parse(handle, "gb"):
        print("%s %s..." % (seq_record.id, seq_record.description[:50]))
        print("Sequence length %i, %i features, from: %s"% (len(seq_record),len(seq_record.features),seq_record.annotations["source"],))

AF191665.1 Opuntia marenae rpl16 gene; chloroplast gene for c...
Sequence length 902, 3 features, from: chloroplast Grusonia marenae
AF191664.1 Opuntia clavata rpl16 gene; chloroplast gene for c...
Sequence length 899, 3 features, from: chloroplast Grusonia clavata
AF191663.1 Opuntia bradtiana rpl16 gene; chloroplast gene for...
Sequence length 899, 3 features, from: chloroplast Grusonia bradtiana

parse SwissProt sequences from the net

#parse SwissProt sequences from the net
from Bio import ExPASy
from Bio import SeqIO
with ExPASy.get_sprot_raw("O23729") as handle:
    seq_record = SeqIO.read(handle, "swiss")
    print(seq_record.id)
    print(seq_record.name)
    print(seq_record.description)
    print(repr(seq_record.seq))
    print("Length %i" % len(seq_record))
    print(seq_record.annotations["keywords"])

parse sequence files as dictionaries

#sequence files as dictionaries
#Sequence files as Dictionaries { In memory
#This is very useful formoderately large files where you only need to access certain elements of the file, and makes for a nice quick’n dirty
#database. 
#You can use the function Bio.SeqIO.to_dict() to make a SeqRecord dictionary (in memory).
#By defaultthis will use each record’s identifier (i.e. the .id attribute) as the key.
orchid_dict = SeqIO.to_dict(SeqIO.parse("ls_orchid.gbk", "genbank"))
list(orchid_dict.keys()) #Under Python 3 the dictionary methods like \.keys()\ and \.values()\ are iterators rather than lists
seq_record = orchid_dict["Z78475.1"]
seq_record.description
seq_record.seq

#Specifying the dictionary keys
def get_accession(record):
""""Given a SeqRecord, return the accession number as a string.
e.g. "gi|2765613|emb|Z78488.1|PTZ78488" -> "Z78488.1"
"""
    parts = record.id.split("|")
    assert len(parts) == 5 and parts[0] == "gi" and parts[2] == "emb"
    return parts[3]

orchid_dict = SeqIO.to_dict(SeqIO.parse("ls_orchid.fasta", "fasta"), key_function=get_accession) #自定义每个seqrecord的key
print(orchid_dict.keys())

#sequence files as dictionaries
#Sequence files as Dictionaries { In memory
#For larger files you should consider Bio.SeqIO.index(),
#it still returns a dictionary like object, this does not keep everything in memory. Instead, it just records where
#each record is within the file { when you ask for a particular record, it then parses it on demand.
orchid_dict = SeqIO.index("ls_orchid.gbk", "genbank")
orchid_dict.keys()
seq_record = orchid_dict["Z78475.1"]
seq_record.description
seq_record.seq
orchid_dict.close()  #Note that Bio.SeqIO.index() won’t take a handle, but only a filename.

#Specifying the dictionary keys
def get_acc(identifier):
""""Given a SeqRecord identifier string, return the accession number as a string.
e.g. "gi|2765613|emb|Z78488.1|PTZ78488" -> "Z78488.1"
"""
    parts = identifier.split("|")
    assert len(parts) == 5 and parts[0] == "gi" and parts[2] == "emb"
    return parts[3]

orchid_dict = SeqIO.index("ls_orchid.fasta", "fasta", key_function=get_acc)
print(orchid_dict.keys())

#Sequence files as Dictionaries { Database indexed files
#Bio.SeqIO.index_db(), which can work on even extremely largefiles since it stores the record information as a file on disk 
#(using an SQLite3 database) rather than in memory.

#先去下载一些大文件
$ curl -O ftp://ftp.ncbi.nih.gov/genbank/gbvrl1.seq.gz
$ curl -O ftp://ftp.ncbi.nih.gov/genbank/gbvrl2.seq.gz
$ curl -O ftp://ftp.ncbi.nih.gov/genbank/gbvrl3.seq.gz
$ curl -O ftp://ftp.ncbi.nih.gov/genbank/gbvrl4.seq.gz
$ gunzip gbvrl*.seq.gz

import glob
from Bio import SeqIO
files = glob.glob("gbvrl*.seq")
gb_vrl = SeqIO.index_db("gbvrl.idx", files, "genbank") #在硬盘上存在一个类似于数据库的gbvrl.idx文件，下次重复这个步骤时只是load index file gbvrl
#print("%i sequences indexed" % len(gb_vrl))
print(gb_vrl["AB811634.1"].description)
print(gb_vrl.get_raw("AB811634.1"))

write sequence files

#write sequence files
#Bio.SeqIO.write() which is for sequence output 
SeqIO.write(my_records, "my_example.faa", "fasta") #some SeqRecord objects, a handle or filename to write to, and a sequence format

#Converting between sequence file formats
from Bio import SeqIO
records = SeqIO.parse("ls_orchid.gbk", "genbank")
count = SeqIO.write(records, "my_example.fasta", "fasta")

# or
count = SeqIO.convert("ls_orchid.gbk", "genbank", "my_example.fasta", "fasta")  #无法做fastq to fasta

Low level FASTA and FASTQ parsers

#Low level FASTA and FASTQ parsers
from Bio.SeqIO.FastaIO import SimpleFastaParser

count = 0
total_len = 0
with open("ls_orchid.fasta") as in_handle:
    for title, seq in SimpleFastaParser(in_handle):
        count += 1
        total_len += len(seq)

print("%i records with total sequence length %i" % (count, total_len))
out_handle.write(">%s\n%s\n" % (title, seq))

#解析fastq文件的轻量级脚本
from Bio.SeqIO.QualityIO import FastqGeneralIterator

count = 0
total_len = 0
with open("example.fastq") as in_handle:
    for title, seq, qual in FastqGeneralIterator(in_handle):
        count += 1
        total_len += len(seq)

print("%i records with total sequence length %i" % (count, total_len))
out_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))

All_Will_Be_Fine噻

关注

0
点赞
踩
8

收藏

觉得还不错? 一键收藏
0
评论
Biopython -- SeqIO

文章目录读取genbank文件parse sequence from compressed filesparse sequence from the netparse SwissProt sequences from the netparse sequence files as dictionarieswrite sequence filesLow level FASTA and FASTQ parsersfrom Bio import SeqIO#parse or read sequenceBio
复制链接

扫一扫