Scraping novels with Python (Part 2): scraping a book's basic information

Having scraped the chapter list and chapter contents, we now scrape the book's basic information.
Building on the previous post, we scrape the book info and store it in a dictionary.

# -*- coding: utf-8 -*-
import urllib.request
import bs4
import re
import sqlite3

def getHtml(url):
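    # Spoof a desktop browser User-Agent; many sites reject requests that
    # carry urllib's default User-Agent string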
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
    headers = {"User-Agent":user_agent}
    request = urllib.request.Request(url,headers=headers)
    response = urllib.request.urlopen(request)
    html = response.read()
    return html


# Fetch a page and parse it with BeautifulSoup
def parse(url):
    html_doc = getHtml(url)
    sp = bs4.BeautifulSoup(html_doc, 'html.parser', from_encoding="utf-8")
    return sp

# Scrape the book's basic information
def get_book_baseinfo(url):
    # class = "info"信息获取
    info = parse(url).find('div',class_ = 'info')
    book_info = {}
    if info:
        # Defaults in case a field is missing on the page
        book_info['title'] = ''
        book_info['img'] = ''
        # Title
        book_info['title'] = info.find('h2').string
        
        # Cover image URL
        img = info.find('div',class_ = 'cover')
        for im in img.children:
            # The src attribute is a relative path, so prepend the site root
            book_info['img'] = 'http://www.biqukan.com' + im.attrs['src']

        # Collect the basic-info text fragments
        ifo = info.find('div',class_ = 'small')
        bkinfo = []
        for b in ifo:
            for v in b.children:
                t = v.string
                if t:
                    bkinfo.append(''.join(t))

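        # At this point bkinfo may look like ['作者:耳根', '分类:玄幻小说', ...];
        # fragments without a ':' are continuations of the previous value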
        # Re-join continuation fragments onto the entry they belong to
        spv = []
        cv = ''
        for v in bkinfo:
            if v.find(':') >= 0:
                if cv:
                    spv.append(cv)
                cv = v
            else:
                cv += v
        spv.append(cv)

        # Turn the 'key:value' strings into dictionary entries
        for element in spv:
            its = [v.strip() for v in element.split(':')]
            if len(its) != 2:
                continue
            nm = its[0].lower()  # normalize to lowercase
            vu = its[1]
            book_info[nm] = vu
            
        # The '作者' key scraped here collides with a key scraped below, so rename it now
        book_info['auther'] = book_info.pop('作者')

        # Synopsis (scraped the same way as the basic info)
        intro = info.find('div',class_ = 'intro')
        bkurl = []
        for b in intro:
            t = b.string
            if t:
                bkurl.append(''.join(t))

        bkjj = []
        cvx = ''
        for w in bkurl:
            if w.find(':') >= 0:
                if cvx:
                    bkjj.append(cvx)
                cvx = w
            else:
                cvx += w
        bkjj.append(cvx)

        for ele in bkjj:
            itis = [n.strip() for n in ele.split(':')]
            if len(itis) != 2:
                continue
            summ = itis[0].lower()  # normalize to lowercase
            vux = itis[1]
            book_info[summ] = vux

    # Crude but effective: rename the Chinese keys to English so they can double as database column names
    book_info['type'] = book_info.pop('分类')
    book_info['status'] = book_info.pop('状态')
    book_info['num'] = book_info.pop('字数')
    book_info['updatatime'] = book_info.pop('更新时间')
    book_info['newchapter'] = book_info.pop('最新章节')
    book_info['authsummery'] = book_info.pop('作者')
    book_info['summery'] = book_info.pop('简介')
    book_info['notipurl'] = book_info.pop('无弹窗推荐地址')

    return book_info



# Get the book's chapter list (table of contents)
def get_book_dir(url):
    books_dir = []
    name = parse(url).find('div', class_='listmain')
    if name:
        dd_items = name.find('dl')
        dt_num = 0
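        # The <dl> appears to hold two <dt> headers (latest chapters, then the
        # full list); counting them lets us skip ahead to the full chapter list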
        for n in dd_items.children:
            ename = str(n.name).strip()
            if ename == 'dt':
                dt_num += 1
            if ename != 'dd':
                continue
            Catalog_info = {}
            if dt_num == 2:
                durls = n.find_all('a')[0]
                Catalog_info['name'] = (durls.get_text())
                Catalog_info['url'] = 'http://www.biqukan.com' + durls.get('href')
                books_dir.append(Catalog_info)
    return books_dir


# Get the text of one chapter
def get_charpter_text(curl):
    # The chapter URL is all we need: fetch the page and read div.showtxt
    text = parse(curl).find('div', class_='showtxt')
    if text:
        cont = text.get_text()
        cont = [str(cont).strip().replace('\r \xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0', '').replace('\u3000\u3000', '')]
        c = " ".join(cont)
        ctext = re.findall(r'^.*?html', c)
        return ctext
    else:
        return ''


# Scrape a whole book: chapter list plus each chapter's text
def get_book(burl):
    # Table of contents
    book = get_book_dir(burl)
    if not book:
        return book

    # Chapter contents
    for d in book:
        curl = d['url']
        try:
            print('Fetching chapter [{}] from [{}]'.format(d['name'],d['url']))
            ctext = get_charpter_text(curl)
            d['text'] = ctext
            print(d['text'])
            print()
        except Exception as err:
            d['text'] = 'get failed'

    return book


if __name__ == '__main__':
    # Call get_book_baseinfo and inspect the result
    book = get_book_baseinfo('http://www.biqukan.com/1_1094/')
    print(book)

The output:

{'title': '一念永恒', 'img': 'http://www.biqukan.com/files/article/image/1/1094/1094s.jpg', 'auther': '耳根', 'type': '玄幻小说', 'status': '连载', 'num': '3689058', 'updatatime': '2018-02-09 18:20:00', 'newchapter': '第1314章 你的选择(终)', 'authsummery': '耳根所写的《一念永恒》无弹窗免费全文阅读为转载作品,章节由网友发布。', 'summery': '一念成沧海,一念化桑田。一念斩千魔,一念诛万仙。唯我念……永恒', 'notipurl': 'http://www.biqukan.com/1_1094/?_t_t_t=0.4355400702253367'}

In a follow-up post we'll store this information in a database.
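As a preview, here is a minimal sketch of what that storage step could look like, using the sqlite3 module imported at the top. The table name books and the fixed column list are my assumptions for illustration; the actual schema comes in the next post.

import sqlite3

def save_book_info(book_info, db_path='books.db'):
    # Columns mirror the English keys produced by get_book_baseinfo
    keys = ['title', 'img', 'auther', 'type', 'status', 'num',
            'updatatime', 'newchapter', 'authsummery', 'summery', 'notipurl']
    conn = sqlite3.connect(db_path)
    try:
        # 'books' is a placeholder table name; one TEXT column per key
        conn.execute('CREATE TABLE IF NOT EXISTS books ({})'.format(
            ', '.join('{} TEXT'.format(k) for k in keys)))
        # Values go through '?' placeholders; missing keys default to ''
        conn.execute(
            'INSERT INTO books ({}) VALUES ({})'.format(
                ', '.join(keys), ', '.join('?' for _ in keys)),
            [book_info.get(k, '') for k in keys])
        conn.commit()
    finally:
        conn.close()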
Writing these posts takes effort; if you liked it, toss the kitty a small tip~ (puppy-dog eyes)
