BibtexParser
官方文档:https://bibtexparser.readthedocs.io/en/master/
简介
bibtex格式的文本的解析工具。
文本为标准的bibtex格式
# Sample input: a single record in standard BibTeX syntax.
bibtex = """@ARTICLE{Cesar2013,
author = {Jean César},
title = {An amazing title},
year = {2013},
volume = {12},
pages = {12--23},
journal = {Nice Journal},
abstract = {This is an abstract. This line should be long enough to test
multilines...},
comments = {A comment},
keywords = {keyword1, keyword2}
}
"""
# The sample contains non-ASCII text ("César"), so the file is written and
# read back with an explicit UTF-8 encoding instead of the platform default;
# both sides must agree for the round trip to be lossless.
with open('bibtex.bib', 'w', encoding='utf-8') as bibfile:
    bibfile.write(bibtex)
# Start parsing
import bibtexparser
with open('bibtex.bib', encoding='utf-8') as bibtex_file:
    bib_database = bibtexparser.load(bibtex_file)
# Each parsed record is a dict; ``entries`` is the list of those dicts.
print(bib_database.entries)
# 输出:
#[{'journal': 'Nice Journal',
# 'comments': 'A comment',
# 'pages': '12--23',
# 'abstract': 'This is an abstract. This line should be long enough to test\nmultilines...',
# 'title': 'An amazing title',
# 'year': '2013',
# 'volume': '12',
# 'ID': 'Cesar2013',
# 'author': 'Jean César',
# 'keyword': 'keyword1, keyword2',
# 'ENTRYTYPE': 'article'}]
文本不是标准的bibtex格式
需要先自行对文本做预处理,将其转为标准格式;否则部分文章的信息会解析不全。
自定义解析格式
新建一个py文件
# customization.py
# 自定义函数,固定写法,参数document是一个字典,代表一篇文章的信息
def author(document):
    """Split the ``author`` field of one entry into a list of names.

    The raw field is cleaned by removing backslashes and collapsing every run
    of whitespace (including line breaks from wrapped fields) into a single
    space, then split on the BibTeX name separator ``and``.  The original
    capitalisation of each name is preserved.

    :param document: dict holding the fields of a single bibliography entry;
        it is modified in place.
    :return: the same dict, with ``document['author']`` set to a list of name
        strings, or to ``None`` when the field is missing or empty.
    """
    import re

    raw_authors = document.get('author')
    if not raw_authors:
        # Missing and empty fields are both normalised to None.
        document['author'] = None
        return document

    # Collapse whitespace first so that a wrapped line such as
    # "... and\n   Next, Name" does not leave padding inside a name.
    flattened = ' '.join(raw_authors.replace('\\', '').split())
    # The separator is matched case-insensitively; the names themselves are
    # left untouched rather than lower-cased.
    document['author'] = re.split(r' and ', flattened, flags=re.IGNORECASE)
    return document
将自定义的格式,应用到解析的过程:
import bibtexparser
from bibtexparser.bparser import BibTexParser
from customization import *
"""
@article{ ISI:000602258800001,
Author = {Waterworth, Samantha C. and Isemonger, Eric W. and Rees, Evan R. and
Dorrington, Rosemary A. and Kwan, Jason C.},
Title = {{Conserved bacterial genomes from two geographically isolated peritidal
stromatolite formations shed light on potential functional guilds}},
Journal = {{ENVIRONMENTAL MICROBIOLOGY REPORTS}},
DOI = {{10.1111/1758-2229.12916}},
Early Access Date = {{DEC 2020}},
ISSN = {{1758-2229}},
ResearcherID-Numbers = {{Kwan, Jason/F-9589-2010}},
ORCID-Numbers = {{Kwan, Jason/0000-0001-9933-1536}},
Unique-ID = {{ISI:000602258800001}},
}
"""
def customizations(record):
    """Apply the field customizations to a single parsed entry.

    Currently the only step is ``author``, which rewrites the entry's
    ``author`` field.

    :param record: dict holding the fields of one entry
    :return: the value returned by ``author(record)``
    """
    return author(record)
def parse_bib_str(bib_str: str):
    """
    Parse a BibTeX-formatted string into a list of entry dicts.

    :param bib_str: BibTeX source text
    :return: list (each item is a dict)
    """
    # Pre-process the text: unwrap doubled braces and hyphenate the field
    # names that contain spaces. Substitutions are applied in this order.
    # NOTE(review): the '{{' / '}}' replacements are global, so they also
    # touch values that legitimately contain nested braces - confirm the
    # inputs never rely on those.
    substitutions = (
        ('{{', '{'),
        ('}}', '}'),
        ('Early Access Date', 'Early-Access-Date'),
        ('Early Access Year', 'Early-Access-Year'),
    )
    normalized = bib_str
    for old, new in substitutions:
        normalized = normalized.replace(old, new)

    # Standard bibtexparser usage: attach the customization hook to a fresh
    # parser, then parse the string with it.
    bib_parser = BibTexParser()
    bib_parser.customization = customizations
    return bibtexparser.loads(normalized, parser=bib_parser).entries
if __name__ == '__main__':
    # Demo: parse '1.bib' and dump every field of its first entry.
    with open('1.bib', encoding='utf-8') as bib_file:
        entries = parse_bib_str(bib_file.read())

    print(len(entries))
    first_entry = entries[0]
    print(first_entry.get('author'))
    for field_name, field_value in first_entry.items():
        print('key:', field_name)
        print('value:', field_value)
        # Visual separator between fields.
        print('#' * 50)
"""
['Waterworth, Samantha C.', 'Isemonger, Eric W.', 'Rees, Evan R.', 'Dorrington, Rosemary A.', 'Kwan, Jason C.']
key: unique-id
value: ISI:000602258800001
##################################################
key: orcid-numbers
value: Kwan, Jason/0000-0001-9933-1536
##################################################
key: researcherid-numbers
value: Kwan, Jason/F-9589-2010
##################################################
key: issn
value: 1758-2229
##################################################
key: early-access-date
value: DEC 2020
##################################################
key: doi
value: 10.1111/1758-2229.12916
##################################################
key: journal
value: ENVIRONMENTAL MICROBIOLOGY REPORTS
##################################################
key: title
value: Conserved bacterial genomes from two geographically isolated peritidal
stromatolite formations shed light on potential functional guilds
##################################################
key: author
value: ['Waterworth, Samantha C.', 'Isemonger, Eric W.', 'Rees, Evan R.', 'Dorrington, Rosemary A.', 'Kwan, Jason C.']
##################################################
key: ENTRYTYPE
value: article
##################################################
key: ID
value: ISI:000602258800001
##################################################
Process finished with exit code 0
"""