数据来源:https://ftp.ncbi.nlm.nih.gov/refseq/release/viral/
codebook:Dealing with GenBank files in Biopython
gbff 文件解析
gbff(GenBank Flat File)即GenBank的平面文件格式,可直接按"genbank"格式解析
使用Biopython中的Bio.SeqIO模块进行解析
from Bio import SeqIO
from tqdm import tqdm
# Read the GenBank file and keep every parsed record in memory.
gb_file = "viral.all.genomic.gbff"
# The context manager guarantees the file handle is closed once parsing
# finishes, including when the parser raises part-way through the file.
with open(gb_file, 'r') as gb_handle:
    # SeqIO.parse is consumed lazily, so the list must be materialised
    # while the handle is still open.
    gb_ls = list(tqdm(SeqIO.parse(gb_handle, 'genbank')))
import pandas as pd
# Empty summary table, one row per GenBank record to be filled in later.
# Columns are grouped by origin: record annotations, ranks picked out of the
# taxonomy lineage, and remaining descriptive fields.
df_genbank = pd.DataFrame(
    columns=(
        ['seq_num', 'access', 'genpart', 'organ', 'source', 'tax', 'topo']
        + ['order', 'family', 'subfamily', 'genus']
        + ['Baltimore class', 'Definition', 'Sequence_Length']
    )
)
# Baltimore classification lookup keyed by molecule type.
# Defined once, outside the loop, because it is constant. It is not consulted
# yet: the 'Baltimore class' column is still left empty below.
# TODO: classes IV-VII have no key assigned yet. Their placeholder '' keys
# collided with each other in the dict literal, so they are omitted until
# real keys are known.
baltimore_mapping = {
    'DNA': 'I',
    'ss-DNA': 'II',
    'ds-RNA': 'III',
}

# Rows are collected in a plain list and turned into a DataFrame once at the
# end; growing the frame with df.loc[i] = ... reallocates on every insert.
_rows = []

# enumerate(tqdm(...)) instead of tqdm(enumerate(...)): tqdm can then read
# len(gb_ls) and display a total.
for i, gb_record in enumerate(tqdm(gb_ls)):
    _annotations = gb_record.annotations
    _accessions = _annotations['accessions']
    _genpart = _annotations['molecule_type']
    _organism = _annotations['organism']
    _source = _annotations['source']
    _taxonomy = _annotations['taxonomy']
    _topology = _annotations['topology']
    _definition = gb_record.description
    _seqlen = len(gb_record.seq)

    # A rank that is not found in the lineage stays an empty list.
    _order = []
    _family = []
    _subfamily = []
    _genus = []
    # Baltimore classification: the needed information seems to be missing,
    # so this column is left empty for now.
    _Baltimore = []

    # Pick taxonomic ranks out of the lineage by their name markers.
    # The checks are independent and a later lineage entry overwrites an
    # earlier match for the same rank.
    for j in _taxonomy:
        if 'virales' in j:
            _order = j
        if 'viridae' in j:
            _family = j
        if 'virinae' in j:
            _subfamily = j
        if 'virus' in j:
            _genus = j

    # Values are listed in the same order as the df_genbank columns;
    # seq_num is 1-based.
    _rows.append([
        i + 1,
        _accessions,
        _genpart,
        _organism,
        _source,
        _taxonomy,
        _topology,
        _order, _family, _subfamily, _genus,
        _Baltimore,
        _definition,
        _seqlen,
    ])

# Build the table in one step, reusing the column layout of the existing
# (empty) df_genbank. Row labels are 0..n-1, matching the loop index.
df_genbank = pd.DataFrame(_rows, columns=df_genbank.columns)
df_genbank
example in viral.all.genomic.gbff
LOCUS NC_001798 154675 bp DNA linear VRL 16-MAY-2016
DEFINITION Human herpesvirus 2 strain HG52, complete genome.
ACCESSION NC_001798
VERSION NC_001798.2
DBLINK BioProject: PRJNA15218
KEYWORDS RefSeq.
SOURCE Human alphaherpesvirus 2 (Herpes simplex virus 2)
ORGANISM Human alphaherpesvirus 2
Viruses; Duplodnaviria; Heunggongvirae; Peploviricota;
Herviviricetes; Herpesvirales; Herpesviridae; Alphaherpesvirinae;
Simplexvirus.
REFERENCE 1 (bases 1 to 154675)
AUTHORS Davison,A.J.
TITLE Evolution of sexually-transmitted and -transmissible human
herpesviruses
JOURNAL Unpublished
REFERENCE 2 (bases 1 to 154675)
CONSRTM NCBI Genome Project
TITLE Direct Submission
JOURNAL Submitted (12-MAY-2015) National Center for Biotechnology
Information, NIH, Bethesda, MD 20894, USA
REFERENCE 3 (bases 1 to 154675)
AUTHORS Holton,M. and Davison,A.J.
TITLE Direct Submission
JOURNAL Submitted (01-NOV-2013) MRC - University of Glasgow Centre for
Virus Research, 8 Church Street, Glasgow G11 5JR, UK
REMARK Sequence update by submitter
REFERENCE 4 (bases 1 to 154675)
AUTHORS Davison,A.J.
TITLE Direct Submission
JOURNAL Submitted (05-AUG-2011) MRC - University of Glasgow Centre for
Virus Research, 8 Church Street, Glasgow G11 5JR, UK
COMMENT PROVISIONAL REFSEQ: This record has not yet been subject to final
NCBI review. The reference sequence is identical to JN561323.
On May 12, 2015 this sequence version replaced NC_001798.1.
The original gene nomenclature has been retained. Genes presumably
inherited from the common ancestor of alpha-, beta- and
gammaherpesviruses (core genes) and non-core genes presumably
inherited from the ancestor of alphaherpesviruses (alpha genes) are
indicated. Initiation codons are assigned with as much confidence
as is possible for each protein-coding region. A standard protein
nomenclature has been applied so that orthologs have the same name
in all herpesviruses.
##Assembly-Data-START##
Assembly Method :: Maq v. 0.7.1, BWA v. 0.6.2-r126
Sequencing Technology :: Illumina
##Assembly-Data-END##
COMPLETENESS: full length.
FEATURES Location/Qualifiers
source 1..154675
/organism="Human alphaherpesvirus 2"
/mol_type="genomic DNA"
/strain="HG52"
/isolation_source="herpetic ulcer near a woman's anus"
/host="Homo sapiens"
/db_xref="taxon:10310"
/country="United Kingdom"
/collected_by="J.F. Peutherer or C.A.C. Ross"
/acronym="HHV-2"
/acronym="HSV-2"
/note="collected prior to 1971; originally called strain
HSG 52; genes UL41 and US8 are frameshifted in majority
and minority populations, respectively; original sequence
(INSD accession Z86099) determined by Sanger sequencing of
plasmids amended by Illumina sequencing of both viral DNA
and polyA RNA harvested at 10 h after infection"
repeat_region 1..9300
/note="TRL; inverted repeat flanking UL"
/rpt_type=inverted
gene complement(<1..7814)
/gene="LAT"
/locus_tag="HHV2s01"
/db_xref="GeneID:24271495"
ncRNA complement(join(<1..4838,7065..7767))
/ncRNA_class="other"
/gene="LAT"
/locus_tag="HHV2s01"
/product="LAT"
/db_xref="GeneID:24271495"
repeat_region 1..254
/note="'a' sequence"
/rpt_type=direct
/rpt_type=inverted
/rpt_type=terminal
variation 11
/gene="LAT"
/locus_tag="HHV2s01"
/replace="c"
gene 440..1743
/gene="RL1"
/locus_tag="HHV2p77"
/db_xref="GeneID:1487286"
CDS join(440..934,1089..1379)
/gene="RL1"
/locus_tag="HHV2p77"
/note="virion protein; inhibits stress-induced
translational arrest; related to eIF2 phosphatase
regulatory subunit GADD34; binds protein phosphatase 1 to
form a holoenzyme capable of dephosphorylating eIF2-alpha;
involved in translational regulation"
/codon_start=1
/product="neurovirulence protein ICP34.5"
/protein_id="YP_009137150.1"
/db_xref="GeneID:1487286"
/translation="MSRRRGPRRRGPRRRPRPGAPAVPRPGAPAVPRPGALPTADSQM
VPAYDSGTAVESAPAASSLLRRWLLVPQADDSDDADYAGNDDAEWANSPPSEGGGKAP
EAPHAAPAAACPPPPPRKERGPQRPLPPHLALRLRTTTEYLARLSLRRRRPPASPPAD
APRGKVCFSPRVQVRHLVAWETAARLARRGSWARERADRDRFRRRVAAAEAVIGPCLE
PEARARARARARAHEDGGPAEEEEAAAAARGSSAAAGPGRRAV"