Python3.7爬虫：爬取小说并写入txt文件 ---笔趣阁

最新推荐文章于 2024-05-30 10:25:47 发布

UUTF-8

最新推荐文章于 2024-05-30 10:25:47 发布

阅读量4.9k

点赞数 2

分类专栏： Python 文章标签： Python

本文链接：https://blog.csdn.net/q764720734/article/details/82740611

版权

Python 专栏收录该内容

1 篇文章 0 订阅

订阅专栏

import re

import requests
from bs4 import BeautifulSoup

# Request header dictionary: browser-like header values for www.biqugex.com.
# Long values are split with implicit string concatenation for readability;
# the resulting strings are unchanged.
req_header = {
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/webp,*/*;q=0.8'
    ),
    'Accept-Encoding': 'gzip, deflate, sdch, br',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': (
        'UM_distinctid=165c0eb51a95a7-0c5f1e30c8c4cd-323f5c0f-144000-165c0eb51aa7c8; '
        'bcolor=; font=; size=; fontcolor=; width=; '
        'CNZZDATA1260821856=941700315-1536539302-'
        'https%253A%252F%252Fwww.baidu.com%252F%7C1536648034'
    ),
    'Host': 'www.biqugex.com',
    'If-Modified-Since': 'Mon, 25 Jun 2018 00:09:02 GMT',
    'If-None-Match': '"1529885342"',
    'Referer': (
        'https://www.biqugex.com/s.php'
        '?ie=gbk&s=9157106854577873494'
        '&q=%C6%DF%BD%E7%CE%E4%C9%F1'
    ),
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.36'
    ),
}

# Novel download function
# id: novel number
# Description of the txt dictionary entries
# title: novel title
# first_page: first chapter page
# txt_section: chapter address
# section_name: chapter name
# section_text: chapter body text
# section_ct: chapter page count
# NOTE(review): in the code below only 'title', 'id', 'author' and 'intro' are
# keys of the txt dict; first_page, section_name and section_text are local
# variables, and txt_section / section_ct do not appear at all.
req_url_base = 'https://www.biqugex.com'  # site root; book and chapter paths are appended to it


# Seconds to wait for the server before giving up on a single request.
_REQUEST_TIMEOUT = 30


def _request_headers():
    """Return a copy of ``req_header`` that is safe to send on every request.

    The conditional headers are dropped because they can make the server
    answer ``304 Not Modified`` with an empty body, and ``Accept-Encoding`` is
    dropped so that ``requests`` only advertises encodings it can decode.
    """
    skipped = {'if-modified-since', 'if-none-match', 'accept-encoding'}
    return {
        name: value
        for name, value in req_header.items()
        if name.lower() not in skipped
    }


def get_txt(txt_id):
    """Download a whole novel and append it to a UTF-8 encoded text file.

    The user is prompted on stdin for the novel number. If the entered line
    is blank, ``txt_id`` is used instead. The book index page is fetched from
    ``req_url_base + '/book_<id>/'``, its metadata is written first, and then
    every chapter page is fetched in order and appended. The output file is
    named ``'<id padded to 8 chars>-<title>.txt'`` in the current directory
    and is opened in append mode.

    Args:
        txt_id: Fallback novel number, used when the prompt is answered with
            an empty line.

    Raises:
        IndexError: If the index page has no book title, or a chapter page
            lacks the expected heading/text elements.
        requests.RequestException: On network failure or timeout.
    """
    txt = {'title': '', 'id': str(txt_id)}

    print("输入要下载的小说编号")
    entered = input().strip()
    if entered:
        txt['id'] = entered

    headers = _request_headers()
    req_url = req_url_base + '/book_' + txt['id'] + '/'  # book index page
    res = requests.get(req_url, headers=headers, timeout=_REQUEST_TIMEOUT)
    soups = BeautifulSoup(res.text, "html.parser")

    title_nodes = soups.select('.book .info h2')
    if not title_nodes:
        raise IndexError('no book title found at ' + req_url)
    txt['title'] = title_nodes[0].text
    # Detail spans, in page order: author, category, serial status,
    # word count, last update time, latest chapter.
    txt['author'] = soups.select('.book .info .small span')
    txt['intro'] = soups.select('.book .info .intro ')

    # Collect everything that comes from the index page before touching the
    # file system, so a malformed page does not leave a partial file behind.
    header_lines = [txt['title']]
    header_lines.extend(span.text for span in txt['author'][:6])
    header_lines.extend(node.text for node in txt['intro'][:1])
    chapter_links = [a['href'] for a in soups.select('.listmain dl dd a')]

    file_name = '{0:0>8}-{1}.txt'.format(txt['id'], txt['title'])
    with open(file_name, "ab") as out:
        for line in header_lines:
            out.write(line.encode('UTF-8'))
            out.write(b'\r\n')

        # The first six links are skipped: index 6 is the first chapter
        # (presumably the earlier ones are "latest chapter" shortcuts).
        for href in chapter_links[6:]:
            r = requests.get(req_url_base + href, headers=headers,
                             timeout=_REQUEST_TIMEOUT)
            soup = BeautifulSoup(r.text, "html.parser")
            section_name = soup.select('#wrapper .book.reader .content h1')[0]
            section_text = soup.select('#wrapper .content .showtxt')[0]
            # Strip embedded scripts so their source does not end up in the text.
            for ss in section_text.select("script"):
                ss.decompose()
            # Collapse each whitespace run into a line break plus tab indent.
            body = re.sub(r'\s+', '\r\n\t', section_text.text).strip('\r\n')

            chapter = '\n' + section_name.text + '\n' + body + '\n'
            out.write(chapter.encode('UTF-8'))
            print('下载 ', section_name.text, '完成')

    # Reached only when every chapter was written successfully.
    print(file_name, '下载完成')
if __name__ == "__main__":
    # Run the download only when executed as a script, so importing this
    # module does not trigger network access or a stdin prompt.
    # get_txt asks for the novel number on stdin.
    get_txt(99999)

更新时间：2018年9月17日15:15:48

UUTF-8

关注

2
点赞
踩
13

收藏

觉得还不错? 一键收藏
1
评论
Python3.7爬虫：爬取小说并写入txt文件 ---笔趣阁

版权声明：本文为博主原创文章，未经博主允许不得转载。 import re  import requests  from bs4 import BeautifulSoup  # 请求字典  req_header = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'...
复制链接

扫一扫