爬取豆瓣书籍信息并保存为RIS文件,再导入EndNote

读书期间下载了许多电子书,但分类很乱。于是突发奇想:可以用EndNote来管理这些电子书。不过手动输入书籍信息很麻烦,所以想到从豆瓣读书中爬取书籍信息。

首先用Python爬取豆瓣中书籍的基本信息,如书名、作者、摘要等;然后将其保存为RIS文件;最后用EndNote读取RIS文件。

实现GUI界面:输入豆瓣中的书籍地址,即可爬取书籍信息并保存为RIS文件,该文件可以导入各种文献管理软件。最后将程序打包成exe文件,下载地址如下:https://download.csdn.net/download/stromlord/12552065

爬取书籍信息,并保存为RIS文件:

def html_request(url, encodeing='utf-8', timeout=5, headers=None):
    """Fetch *url* with a single GET request and return the decoded body.

    Args:
        url: Address to request.
        encodeing: Codec used to decode the response bytes; undecodable
            bytes are dropped (``'ignore'`` error handler).
        timeout: Timeout in seconds handed to ``requests.get``.
        headers: Optional HTTP headers handed to ``requests.get``.

    Returns:
        The decoded page text when the status code is 200, otherwise ``None``.
    """
    reply = requests.get(url, headers=headers, timeout=timeout)
    if reply.status_code != 200:
        return None
    return reply.content.decode(encodeing, 'ignore')


def html_read(url, encodeing='utf-8', timeout=5, headers=None):
    """Fetch *url* via ``html_request``, retrying on timeouts and connection errors.

    Read timeouts and connection errors are counted separately. After each
    failure the function sleeps 3 seconds and prints a marker; once more than
    10 failures of the same kind have occurred it gives up.

    Args:
        url: Address to request.
        encodeing: Codec forwarded to ``html_request``.
        timeout: Per-request timeout in seconds forwarded to ``html_request``.
        headers: Optional HTTP headers forwarded to ``html_request``.

    Returns:
        Whatever ``html_request`` returns on the first attempt that does not
        raise (page text, or ``None`` for a non-200 status), or ``''`` when
        the retry budget is exhausted.
    """
    timeouts_seen = 0
    connection_errors_seen = 0
    while True:
        try:
            return html_request(url, encodeing, timeout, headers=headers)
        except requests.exceptions.ReadTimeout:
            time.sleep(3)
            print("ReadTimeout", end='')
            timeouts_seen += 1
            if timeouts_seen > 10:
                return ''
        except requests.exceptions.ConnectionError:
            time.sleep(3)
            print("ConnectionError", end='')
            connection_errors_seen += 1
            if connection_errors_seen > 10:
                return ''


def _parse_book_page(book_soup):
    """Extract the book fields from a parsed Douban book page into a dict."""
    info = {
        '书名': None,
        '作者': None,
        '摘要': None,
        '出版社': None,
        '副标题': None,
        '原作名': None,
        '译者': None,
        '出版年': None,
        '页数': None,
        '丛书': None,
        'ISBN': None,
        '标签': None,
    }

    title_tags = book_soup.select('title')
    if not title_tags:
        # Without a <title> there is nothing to name the record after.
        raise ValueError('no <title> element found in the downloaded page')
    info['书名'] = str(title_tags[0]).replace('<title>', '') \
        .replace('</title>', '').replace(' (豆瓣)', '')

    # The "#info" box is one <div> whose fields are separated by <br/>;
    # str() of the select() result is the list repr, hence the stray
    # '[' / ']' that are stripped further down.
    link_text = re.compile('<a .*?">(.*?)</a>')
    for item in str(book_soup.select('#info')).split('<br/>'):
        if '作者' in item:
            info['作者'] = link_text.findall(item.replace('\n', ''))
        elif '译者' in item:
            info['译者'] = link_text.findall(item.replace('\n', ''))
        elif '丛书' in item:
            info['丛书'] = str(link_text.findall(item)) \
                .replace("['", '').replace("']", '')
        else:
            item = item.replace('\n<span class="pl">', '') \
                .replace('[<div class="" id="info">', '')
            parts = item.split(':</span> ')
            # Only keep labels that are known keys of the info dict.
            if len(parts) >= 2 and parts[0] in info:
                info[parts[0]] = parts[1].replace('<br>', '') \
                    .replace('</br>', '').replace('</div>', '').replace(']', '')

    # Tags are joined as "a, b, c, " (each one followed by ', ').
    tag_text = re.compile('<a .*?>(.*?)</a>')
    tag_matches = (tag_text.findall(str(tag))
                   for tag in book_soup.select('.indent span .tag'))
    # Skip elements the pattern does not match instead of failing on [0].
    info['标签'] = ''.join(found[0] + ', ' for found in tag_matches if found)

    intros = book_soup.select('.intro')
    if intros:
        chosen = intros[0]
        # When the first ".intro" contains "javascript" the second one is
        # used instead -- but only if a second one actually exists.
        if 'javascript' in str(intros[0]) and len(intros) > 1:
            chosen = intros[1]
        info['摘要'] = str(chosen).replace('<p>', '') \
            .replace('<div class="intro">', '') \
            .replace('</div>', '').replace('</p>', '\n')

    # Missing fields become empty strings so they can be concatenated.
    return {key: ('' if value is None else value) for key, value in info.items()}


def _append_ris_record(save_path, info):
    """Append one RIS "BOOK" record built from *info* to the file *save_path*."""
    lines = ["TY  - BOOK"]
    # '译者' / '作者' are either a list of names or '' (nothing to write).
    lines.extend("A4  - " + str(name) for name in info['译者'])
    lines.append("AB  - " + info['摘要'])
    lines.extend("AU  - " + str(name) for name in info['作者'])
    lines.append("DA  - " + info['出版年'])
    lines.append("PY  - " + info['出版年'][0:4])
    lines.append("KW  - " + info['标签'])
    lines.append("PB  - " + info['出版社'])
    lines.append("SN  - " + info['ISBN'])
    lines.append("T2  - " + info['丛书'])
    lines.append("TI  - " + info['书名'])
    lines.append("ER  - ")
    lines.append("")  # blank line after the record

    # The record is assembled first so the file is written in one call.
    with codecs.open(save_path, mode='a', encoding='utf-8') as f:
        f.write("\n".join(lines) + "\n")


def book_info(book_url, ris_dir='', series=None, ris_flag=True):
    """Scrape one Douban book page and optionally append it to a RIS file.

    Args:
        book_url: Address of the Douban book page.
        ris_dir: Prefix prepended verbatim to the output file name (include a
            trailing path separator when it names a directory).
        series: When given, the record is appended to ``<series>.ris``
            instead of a file named after the book title.
        ris_flag: When true, the RIS record is written; when false the page
            is only downloaded and parsed.

    Returns:
        The book title taken from the page's ``<title>`` element.

    Raises:
        ValueError: If the page could not be downloaded or has no ``<title>``.
    """
    book_html = html_read(book_url, headers=DouBan_header)
    if not book_html:
        # html_read yields '' after exhausting retries and None on non-200.
        raise ValueError('failed to download book page: ' + str(book_url))
    time.sleep(1.5)

    info = _parse_book_page(BeautifulSoup(book_html, "html.parser"))

    if ris_flag:
        file_stem = info['书名'].replace('/', '') if series is None else series
        save_path = (ris_dir + file_stem + '.ris').replace('\n', '')
        _append_ris_record(save_path, info)

    return info['书名']

用tkinter制作GUI,实现功能:

class tkURL(object):
    """Minimal Tk window: enter a book URL, scrape it, show the result path.

    The window holds a URL entry, a status label, and three buttons:
    "Clip" (paste the clipboard into the entry), "URL" (scrape the entered
    address via ``book_info``) and "Quit".
    """

    def __init__(self):
        """Build the widgets; the caller is expected to start the main loop."""
        self.top = Tk()
        self.entry = Entry(self.top, width=50)
        self.entry.pack()

        # Text shown in the status label; 'None' until something was scraped.
        self.cwd = StringVar(self.top)
        self.cwd.set('None')

        self.label = Label(self.top, textvariable=self.cwd)
        self.label.pack()

        self.frame = Frame(self.top)
        self.bClip = Button(self.frame, text='Clip', command=self.entryClip,
                            activeforeground='white', activebackground='red')
        self.bUrl = Button(self.frame, text='URL', command=self.getEntry,
                           activeforeground='white', activebackground='blue')
        self.bQuit = Button(self.frame, text='Quit', command=self.top.quit,
                            activeforeground='white', activebackground='red')
        self.bClip.pack(side=LEFT)
        self.bUrl.pack(side=LEFT)
        self.bQuit.pack(side=RIGHT)
        self.frame.pack()

    def getEntry(self, ev=None):
        """Scrape the URL in the entry and show the title under the working dir.

        On failure the error text is shown in the label and the exception is
        re-raised so Tk's normal callback error reporting still runs.
        """
        # Pasted addresses often carry surrounding whitespace or a newline.
        url = self.entry.get().strip()
        try:
            title = book_info(url, ris_dir='')
        except Exception as exc:
            self.cwd.set('Error: {}'.format(exc))
            raise
        # os.path.join inserts the path separator that plain string
        # concatenation of getcwd() and the title would omit.
        self.cwd.set(os.path.join(os.getcwd(), title))

    def entryClip(self, ev=None):
        """Replace the entry's content with the current clipboard text."""
        self.entry.delete(0, END)
        content = pyperclip.paste()
        self.entry.insert(0, content)

最后用PyInstaller将程序打包成exe文件。完成!

  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值