import os
import re
import urllib.request
import urllib.error
from multiprocessing import Process

interval = 1000          # how many book IDs each process handles
process_start = 1        # first process number
process_end = 30         # last process number
base_path = r'f:\novel'  # directory where downloads are saved
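
# Work sharding: process i crawls book IDs (i-1)*interval+1 .. i*interval,
# e.g. with interval = 1000, process 1 covers IDs 1..1000 and process 30
# covers IDs 29001..30000.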

def get_novel(args, inr):
    ite = (args - 1) * inr  # starting book ID for this process (incremented before first use)
    process = str(args)     # process number, used in console output
    log_path = base_path + r'\log' + str(args) + '.txt'  # per-process log file
    base_url = 'https://www.x23qb.com'       # site root
    chapter_base_url = base_url + r'/book/'  # root of the book index pages
    urls_complete_reg = compile_reg(r'<ul class="chaw_c" id="chapterList">(.*?)<div class="chaptername">',
                                    re.S)    # block containing the chapter URLs
    urls_reg = compile_reg(r'<li><a href="(.*?)">(第.*?)</a></li>',
                           0)  # chapter URL and title; flags=0 matches within lines, re.S lets '.' span newlines
    title_reg = compile_reg(r'<title>(.*?)全.*?</title>', 0)  # book title
    chapter_reg_has_next = compile_reg(r'<dt class="rd"><script>chapter.*?;</script></dt>(.*?)<p style=',
                                       re.S)  # chapter body when a next page exists
    chapter_reg_no_next = compile_reg(r'<dt class="rd"><script>chapter.*?;</script></dt>(.*?)<p id="p1nt"',
                                      re.S)   # chapter body when there is no next page
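    # A quick sketch of what urls_reg extracts (the <li> markup here is
    # hypothetical, reconstructed from the pattern rather than captured from
    # the site):
    #   re.findall(urls_reg, '<li><a href="/book/1/2.html">第一章 开始</a></li>')
    #   -> [('/book/1/2.html', '第一章 开始')]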
    if os.path.isfile(log_path):  # a log already exists: resume from it
        with open(log_path, 'r') as f1:
            log_list = f1.readlines()
            ite = int(log_list[-1])  # last book ID written to the log
            ite = ite - 1            # that book may be incomplete, so re-download it
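    # The log holds one book ID per line, appended below *before* the download
    # starts, so after a crash the last line marks the possibly incomplete book.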
    while ite < args * inr:  # loop until this process's ID range is exhausted
        ite = ite + 1
        print('\033[1;34mProcess: ' + process + ' \033[0m preparing to download book: ' + str(ite))
        with open(log_path, 'a') as k:
            k.write(str(ite) + '\n')  # log the ID before downloading
        html = get_html(chapter_base_url + str(ite) + '/', process)  # fetch the book index page
        urls_html = re.findall(urls_complete_reg, html)  # block containing the chapter URLs
        title = re.findall(title_reg, html)              # book title
        if not (title and urls_html):  # title or URL block missing:
            continue                   # the book does not exist, skip it
        urls = re.findall(urls_reg, urls_html[0])  # chapter URLs and titles
        title[0] = title[0].replace('?', '')  # strip '?' (not allowed in file names)
        novel_path = os.path.join(base_path, str(ite) + ' ' + title[0] + '.txt')  # output file path
        if os.path.exists(novel_path):  # if the file already exists
            os.remove(novel_path)       # delete it
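        # Chapters are appended one at a time below, so any leftover file from
        # an interrupted run has to be removed before the book is re-downloaded.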
        for url in urls:
            count = 1  # page counter for multi-page chapters, e.g. 230305, 230305_2, 230305_3
            chapter_url = base_url + url[0]  # full chapter URL
            chapter_title = url[1]           # chapter title
            chapter_html = get_html(chapter_url, process)  # chapter page source
            chapter_content = re.findall(chapter_reg_has_next, chapter_html)  # try the "has next page" pattern first
            if chapter_content:  # there is a next page
                chapter_content[0] = chapter_content[0][:-4]  # drop the last 4 characters
                while True:
                    count = count + 1  # second page is numbered 2, and so on
                    chapter_next_url = chapter_url.replace('.html', '') + '_' + str(count) + '.html'  # next page URL
                    chapter_next_html = get_html(chapter_next_url, process)  # next page source
                    chapter_next_content = re.findall(chapter_reg_has_next, chapter_next_html)  # next page body
                    if chapter_next_content:  # yet another page follows
                        chapter_next_content[0] = chapter_next_content[0][8:-10]  # drop the first 8 and last 10 characters
                        chapter_content[0] = chapter_content[0] + chapter_next_content[0]  # append to the chapter body
                    else:  # no further page: match with the "no next page" pattern, then stop
                        chapter_next_content = re.findall(chapter_reg_no_next, chapter_next_html)
                        chapter_next_content[0] = chapter_next_content[0][8:-10]  # drop the first 8 and last 10 characters
                        chapter_content[0] = chapter_content[0] + chapter_next_content[0]  # append to the chapter body
                        break
            else:  # no next page
                chapter_content = re.findall(chapter_reg_no_next, chapter_html)  # single-page chapter
            for content in chapter_content:  # write out the chapter
                content = chapter_title + content  # prepend the chapter title
                content = content.replace('</p>', '\n')  # turn '</p>' into newlines
                content = content.replace('<p>', '')     # strip '<p>' tags
                with open(novel_path, 'a') as f:
                    f.write(content)  # append the chapter text
            print('\033[1;34mProcess: ' + process + ' \033[0m' + str(ite) + ' ' + title[0]
                  + ' ' + chapter_title + ' downloaded')  # console log

def get_html(url, process):
    count = 0
    while True:  # retry until the request succeeds
        count = count + 1
        try:
            html = urllib.request.urlopen(url, timeout=10).read()  # raw page bytes
            if html:
                break
        except Exception:
            print('\033[1;34mProcess: ' + process + '\033[0m \033[1;31mnetwork problem, retrying, attempt ' + str(count) + '\033[0m')
    html = html.decode('gbk', 'ignore')  # the site serves GBK-encoded pages
    return html
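
# Note: the retry loop above never gives up on a dead URL. A minimal sketch of
# a capped-retry variant (an assumption, not part of the original script):
#
#     for attempt in range(1, 6):  # give up after five attempts
#         try:
#             return urllib.request.urlopen(url, timeout=10).read().decode('gbk', 'ignore')
#         except OSError:  # urllib.error.URLError subclasses OSError
#             print('request failed, attempt ' + str(attempt))
#     return ''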

def compile_reg(reg, flags):  # thin wrapper around re.compile
    return re.compile(reg, flags)

if __name__ == "__main__":
    for i in range(process_start, process_end + 1):  # launch one process per ID range
        p = Process(target=get_novel, args=(i, interval))
        p.start()
        print('Process ' + str(i) + ' started successfully')
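
# With the defaults above the script launches all 30 processes at once,
# covering book IDs 1..30000. Each book is written to '<id> <title>.txt' under
# f:\novel, and each process keeps its own logN.txt there for resuming; note
# that the script never creates f:\novel, so the directory must already exist.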