Biopython用户手册:Biopython Tutorial and Cookbook
e.g.1
def get_num(search_info):
    """Return how many nucleotide (nuccore) records match a query.

    Runs an Entrez EGQuery (cross-database query) for ``search_info`` and
    picks out the hit count reported for the ``nuccore`` database.

    Args:
        search_info: Entrez search term, e.g.
            ``"Cryptotaenia[Organ] AND rbcl[gene]"``.

    Returns:
        The ``Count`` value of the ``nuccore`` row exactly as Entrez reports
        it (a string such as ``"17"``), or ``"0"`` when the result contains
        no ``nuccore`` row.
    """
    handle = Entrez.egquery(term=search_info)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the connection even when parsing fails.
        handle.close()

    for row in record["eGQueryResult"]:
        if row["DbName"] == "nuccore":
            return row["Count"]

    # No nuccore row: report zero hits rather than failing on an unbound
    # local variable.
    return "0"
>handle的类型
>record的类型与record对象值
<class 'Bio.Entrez.Parser.DictionaryElement'>
{'Term': 'Cryptotaenia[Organ] AND rbcl[gene]', 'eGQueryResult': [{'DbName': 'pubmed', 'MenuName': 'PubMed', 'Count': '0', 'Status': 'Term or Database is not found'}, {'DbName': 'pmc', 'MenuName': 'PubMed Central', 'Count': '12', 'Status': 'Ok'}, {'DbName': 'mesh', 'MenuName': 'MeSH', 'Count': '0', 'Status': 'Term or Database is not found'}, ...]}
record是一个字典,包含'Term'和'eGQueryResult'两个键;'eGQueryResult'的值为一个列表,列表元素为字典,每个字典记录了该检索词在一个数据库中的搜索结果(数据库名、记录数、状态)
e.g.2
def get_gi(search_info, num):
    """Search the nucleotide database and return the ids of the hits.

    Args:
        search_info: Entrez search term.
        num: Maximum number of ids to retrieve; passed to ESearch as
            ``retmax``.

    Returns:
        The ``IdList`` entry of the parsed ESearch result, i.e. a list of
        record id strings.
    """
    handle = Entrez.esearch(db="nucleotide", term=search_info, retmax=num)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the connection even when parsing fails.
        handle.close()
    return record["IdList"]
>record类型及record值
<class 'Bio.Entrez.Parser.DictionaryElement'>
{'Count': '17', 'RetMax': '17', 'RetStart': '0', 'IdList': ['1677620849', '1829765892', '1815522133', '1597481740', '874509839', '874509815', '874509593', '1316027588', '1316025894', '313664050', '825715550', '642989129', '642989127', '642989125', '642989123', '685846972', '67078997'], 'TranslationSet': [{'From': 'Cryptotaenia[All Fields]', 'To': '"Cryptotaenia"[Organism] OR Cryptotaenia[All Fields]'}], 'TranslationStack': [{'Term': '"Cryptotaenia"[Organism]', 'Field': 'Organism', 'Count': '86', 'Explode': 'Y'}, {'Term': 'Cryptotaenia[All Fields]', 'Field': 'All Fields', 'Count': '145', 'Explode': 'N'}, 'OR', 'GROUP', {'Term': 'rbcl[gene]', 'Field': 'gene', 'Count': '315690', 'Explode': 'N'}, 'AND'], 'QueryTranslation': '("Cryptotaenia"[Organism] OR Cryptotaenia[All Fields]) AND rbcl[gene]', 'ErrorList': {'PhraseNotFound': [], 'FieldNotFound': ['Organ']}}
record是一个字典,比较重要的键有'Count',值为记录数目(字符串);'IdList',值为一个包含记录id号的列表
e.g.3
def get_SeqInfo(gi="1677620849"):
    """Download one nucleotide entry and save it to a local file.

    The entry is fetched as GenBank XML and written to a file named after
    the id in the current working directory. If that file already exists,
    nothing is fetched or written.

    Args:
        gi: Id of the nucleotide record to download. Defaults to the id the
            function originally had hard-coded.

    Returns:
        None.
    """
    # efetch options used below:
    #   rettype: [gb | fasta]
    #   retmode: [xml | text]
    filename = str(gi)
    if os.path.isfile(filename):
        # Already downloaded: skip the network request entirely.
        return

    handle = Entrez.efetch(db="nucleotide", id=gi, rettype="gb", retmode="xml")
    try:
        data = handle.read()
    finally:
        handle.close()

    # The XML payload comes back as bytes, and writing bytes to a text-mode
    # file raises "TypeError: write() argument must be str, not bytes".
    # Choose the file mode from the payload type so both cases work.
    if isinstance(data, bytes):
        with open(filename, "wb") as f:
            f.write(data)
    else:
        with open(filename, "w") as f:
            f.write(data)
Traceback (most recent call last):
File "d:\python_workspace\建树\1获取与清理\demo.py", line 47, in <module>
get_SeqInfo(gi)
File "d:\python_workspace\建树\1获取与清理\demo.py", line 38, in get_SeqInfo
f.write(handle.read())
TypeError: write() argument must be str, not bytes
#retmode="xml"时需要用Entrez.read(handle)进行解析,返回为<class 'Bio.Entrez.Parser.ListElement'>;如果用handle.read()则返回<class 'bytes'>,写入文件时必须以二进制模式('wb')打开,否则出现上面的TypeError
#retmode="text"时直接用handle.read(),返回为<class 'str'>
*用SeqIO解析得到的SeqRecord中的几个常用属性
>.features
def get_SeqInfo():
    """Fetch one GenBank entry and print the features of its first record.

    Returns:
        None; the feature list is printed to standard output.
    """
    handle = Entrez.efetch(db="nucleotide", id="1677620849", rettype="gb", retmode="text")
    try:
        # SeqIO.parse reads the GenBank text and yields SeqRecord objects
        # lazily, so the first record is pulled before the handle is closed.
        record_iterator = SeqIO.parse(handle, 'gb')
        record = next(record_iterator)
    finally:
        handle.close()

    # .features is a list of the record's SeqFeature objects.
    features = record.features
    print(features)
<class 'Bio.SeqRecord.SeqRecord'>
[SeqFeature(FeatureLocation(ExactPosition(0), ExactPosition(568), strand=1), type='source'), SeqFeature(FeatureLocation(BeforePosition(0), AfterPosition(568), strand=1), type='gene'), SeqFeature(FeatureLocation(BeforePosition(0), AfterPosition(568), strand=1), type='CDS')]
#返回一个列表,列表元素为<class 'Bio.SeqFeature.SeqFeature'>;打印结果中只显示位置和特征类型(如source、gene、CDS),不显示基因名字,基因名等信息保存在各SeqFeature的qualifiers属性中
>.annotations
def get_SeqInfo():
    """Fetch one GenBank entry and read the annotations of its first record.

    The annotations are only bound to a local name; as in the original
    snippet, they are neither printed nor returned.

    Returns:
        None.
    """
    handle = Entrez.efetch(db="nucleotide", id="1677620849", rettype="gb", retmode="text")
    try:
        # The parser is lazy, so the first record is pulled before the
        # handle is closed.
        record_iterator = SeqIO.parse(handle, 'gb')
        record = next(record_iterator)
    finally:
        handle.close()

    # .annotations is a dictionary (molecule type, organism, taxonomy, ...).
    ano = record.annotations
{'molecule_type': 'DNA', 'topology': 'linear', 'data_file_division': 'PLN', 'date': '01-AUG-2020', 'accessions': ['MH658318'], 'sequence_version': 1, 'keywords': [''], 'source': 'chloroplast Cryptotaenia japonica', 'organism': 'Cryptotaenia japonica', 'taxonomy': ['Eukaryota', 'Viridiplantae', 'Streptophyta', 'Embryophyta', 'Tracheophyta', 'Spermatophyta', 'Magnoliopsida', 'eudicotyledons', 'Gunneridae', 'Pentapetalae', 'asterids', 'campanulids', 'Apiales', 'Apiaceae', 'Apioideae', 'Oenantheae', 'Cryptotaenia'], 'references': [Reference(title='DNA barcoding the Flora of Qinling Mt. in China', ...), Reference(title='Direct Submission', ...)], 'structured_comment': OrderedDict([('Assembly-Data', OrderedDict([('Sequencing Technology', 'Sanger dideoxy sequencing')]))])}
#返回为字典
>.id
返回条目的id,即带版本号的accession(例如上面的条目accession为MH658318、sequence_version为1,其id为MH658318.1)