import os
import re
import urllib.request
import urllib.error
from multiprocessing import Process

interval = 1000          # how many book IDs each process handles
process_start = 1        # first process number
process_end = 30         # last process number
base_path = r'f:\novel'  # directory where downloads are saved
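
# Work sharding: process i crawls book IDs (i-1)*interval+1 .. i*interval,
# e.g. with interval = 1000, process 1 covers IDs 1..1000 and process 30
# covers IDs 29001..30000.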

def get_novel(args, inr):
    ite = (args - 1) * inr  # starting book ID for this process (incremented before first use)
    process = str(args)     # process number, used in console output
    log_path = base_path + r'\log' + str(args) + '.txt'  # per-process log file
    base_url = 'https://www.x23qb.com'       # site root
    chapter_base_url = base_url + r'/book/'  # root of the book index pages
    urls_complete_reg = compile_reg(r'<ul class="chaw_c" id="chapterList">(.*?)<div class="chaptername">',
                                    re.S)    # block containing the chapter URLs
    urls_reg = compile_reg(r'<li><a href="(.*?)">(第.*?)</a></li>',
                           0)  # chapter URL and title; flags=0 matches within lines, re.S lets '.' span newlines
    title_reg = compile_reg(r'<title>(.*?)全.*?</title>', 0)  # book title
    chapter_reg_has_next = compile_reg(r'<dt class="rd"><script>chapter.*?;</script></dt>(.*?)<p style=',
                                       re.S)  # chapter body when a next page exists
    chapter_reg_no_next = compile_reg(r'<dt class="rd"><script>chapter.*?;</script></dt>(.*?)<p id="p1nt"',
                                      re.S)   # chapter body when there is no next page
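    # A quick sketch of what urls_reg extracts (the <li> markup here is
    # hypothetical, reconstructed from the pattern rather than captured from
    # the site):
    #   re.findall(urls_reg, '<li><a href="/book/1/2.html">第一章 开始</a></li>')
    #   -> [('/book/1/2.html', '第一章 开始')]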
    if os.path.isfile(log_path):  # a log already exists: resume from it
        with open(log_path, 'r') as f1:
            log_list = f1.readlines()
            ite = int(log_list[-1])  # last book ID written to the log
            ite = ite - 1            # that book may be incomplete, so re-download it
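    # The log holds one book ID per line, appended below *before* the download
    # starts, so after a crash the last line marks the possibly incomplete book.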
    while ite < args * inr:  # loop until this process's ID range is exhausted
        ite = ite + 1
        print('\033[1;34mProcess: ' + process + ' \033[0m preparing to download book: ' + str(ite))
        with open(log_path, 'a') as k:
            k.write(str(ite) + '\n')  # log the ID before downloading
        html = get_html(chapter_base_url + str(ite) + '/', process)  # fetch the book index page
        urls_html = re.findall(urls_complete_reg, html)  # block containing the chapter URLs
        title = re.findall(title_reg, html)              # book title
        if not (title and urls_html):  # title or URL block missing:
            continue                   # the book does not exist, skip it
        urls = re.findall(urls_reg, urls_html[0])  # chapter URLs and titles
        title[0] = title[0].replace('?', '')  # strip '?' (not allowed in file names)
        novel_path = os.path.join(base_path, str(ite) + ' ' + title[0] + '.txt')  # output file path
        if os.path.exists(novel_path):  # if the file already exists
            os.remove(novel_path)       # delete it
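        # Chapters are appended one at a time below, so any leftover file from
        # an interrupted run has to be removed before the book is re-downloaded.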
        for url in urls:
            count = 1  # page counter for multi-page chapters, e.g. 230305, 230305_2, 230305_3
            chapter_url = base_url + url[0]  # full chapter URL
            chapter_title = url[1]           # chapter title
            chapter_html = get_html(chapter_url, process)  # chapter page source
            chapter_content = re.findall(chapter_reg_has_next, chapter_html)  # try the "has next page" pattern first
            if chapter_content:  # there is a next page
                chapter_content[0] = chapter_content[0][:-4]  # drop the last 4 characters
                while True:
                    count = count + 1  # second page is numbered 2, and so on
                    chapter_next_url = chapter_url.replace('.html', '') + '_' + str(count) + '.html'  # next page URL
                    chapter_next_html = get_html(chapter_next_url, process)  # next page source
                    chapter_next_content = re.findall(chapter_reg_has_next, chapter_next_html)  # next page body
                    if chapter_next_content:  # yet another page follows
                        chapter_next_content[0] = chapter_next_content[0][8:-10]  # drop the first 8 and last 10 characters
                        chapter_content[0] = chapter_content[0] + chapter_next_content[0]  # append to the chapter body
                    else:  # no further page: match with the "no next page" pattern, then stop
                        chapter_next_content = re.findall(chapter_reg_no_next, chapter_next_html)
                        chapter_next_content[0] = chapter_next_content[0][8:-10]  # drop the first 8 and last 10 characters
                        chapter_content[0] = chapter_content[0] + chapter_next_content[0]  # append to the chapter body
                        break
            else:  # no next page
                chapter_content = re.findall(chapter_reg_no_next, chapter_html)  # single-page chapter
            for content in chapter_content:  # write out the chapter
                content = chapter_title + content  # prepend the chapter title
                content = content.replace('</p>', '\n')  # turn '</p>' into newlines
                content = content.replace('<p>', '')     # strip '<p>' tags
                with open(novel_path, 'a') as f:
                    f.write(content)  # append the chapter text
            print('\033[1;34mProcess: ' + process + ' \033[0m' + str(ite) + ' ' + title[0]
                  + ' ' + chapter_title + ' downloaded')  # console log

def get_html(url, process):
    count = 0
    while True:  # retry until the request succeeds
        count = count + 1
        try:
            html = urllib.request.urlopen(url, timeout=10).read()  # raw page bytes
            if html:
                break
        except Exception:
            print('\033[1;34mProcess: ' + process + '\033[0m \033[1;31mnetwork problem, retrying, attempt ' + str(count) + '\033[0m')
    html = html.decode('gbk', 'ignore')  # the site serves GBK-encoded pages
    return html
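
# Note: the retry loop above never gives up on a dead URL. A minimal sketch of
# a capped-retry variant (an assumption, not part of the original script):
#
#     for attempt in range(1, 6):  # give up after five attempts
#         try:
#             return urllib.request.urlopen(url, timeout=10).read().decode('gbk', 'ignore')
#         except OSError:  # urllib.error.URLError subclasses OSError
#             print('request failed, attempt ' + str(attempt))
#     return ''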

def compile_reg(reg, flags):  # thin wrapper around re.compile
    return re.compile(reg, flags)

if __name__ == "__main__":
    for i in range(process_start, process_end + 1):  # launch one process per ID range
        p = Process(target=get_novel, args=(i, interval))
        p.start()
        print('Process ' + str(i) + ' started successfully')
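
# With the defaults above the script launches all 30 processes at once,
# covering book IDs 1..30000. Each book is written to '<id> <title>.txt' under
# f:\novel, and each process keeps its own logN.txt there for resuming; note
# that the script never creates f:\novel, so the directory must already exist.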