Dependencies
The scraper is built on an HTTP client and an HTML parser: requests and BeautifulSoup.
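Both can be installed with pip (pip install requests beautifulsoup4). The third import below, XFile, is the author's own small file helper; its full source is listed in the appendix at the end of this post.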
import time  # used by download_chapters to pause between requests
import requests
from bs4 import BeautifulSoup
from utils.xfile import XFile
Requesting page content
Use any plausible request header. Note that some sites filter out requests carrying a Windows header; in that case, construct a mobile-browser header instead.
@staticmethod
def get_response(html_url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
    }
    response = requests.get(url=html_url, headers=headers)
    # Let requests guess the real encoding from the content so non-UTF-8 pages decode correctly
    response.encoding = response.apparent_encoding
    return response
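If the target site rejects the desktop User-Agent above, swap in a mobile one. A minimal sketch; the string below is a generic Android Chrome UA chosen for illustration, not taken from the original post:

headers = {
    # A generic Android Chrome User-Agent (illustrative, not site-specific)
    'User-Agent': 'Mozilla/5.0 (Linux; Android 10; Pixel 3) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/81.0.4044.138 Mobile Safari/537.36'
}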
Downloading a single chapter
chapter_url: the URL of a single chapter, from which the chapter title and body text are extracted.
file: the file object the text is written to.
The method returns the URL of the next chapter.
def get_one_chapter(self, chapter_url, file):
    # Fetch the page
    response = self.get_response(chapter_url)
    # Load the page content into a BeautifulSoup parser
    soup = BeautifulSoup(response.text, 'html.parser')
    # Chapter title
    chapter_name = soup.find('div', {'class': 'bookname'}).find('h1').text
    print(f'Current chapter: {chapter_name}')
    file.write(f"{chapter_name}\n")
    # Link to the next chapter
    next_chapter = soup.find('div', {'class': 'bookname'}).find('a', {'class': 'next'})
    print(f"Next chapter: {next_chapter['href']}")
    # Chapter body text
    content_lines = soup.find('div', {'id': 'content'}).find_all('p', {'class': 'content_detail'})
    for line in content_lines:
        new_txt = line.text.replace("\r", "").replace("\n", "").replace("\t", "").strip()
        file.write(f"{new_txt}\n")
    file.write("\n\n")
    # Return the path of the next chapter. Only a relative path is returned
    # ('/book/66666/777771123.html'), so the caller must build the full URL.
    return next_chapter['href']
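Callers build the absolute URL by concatenating self._root_url with this relative path. As an aside, urllib.parse.urljoin from the standard library is a slightly more robust way to do the same join, since it also copes with absolute hrefs; this is a suggested alternative, not what the original code does:

from urllib.parse import urljoin

# Handles both relative paths and full URLs in the href
next_url = urljoin('https://www.aaaaa.vip', '/book/66666/777771123.html')
print(next_url)  # https://www.aaaaa.vip/book/66666/777771123.html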
Downloading a range of chapters
save_path: the path the novel is saved to.
The code is short and easy to follow: it loops over the requested number of chapters, downloading one per iteration and following each page's next-chapter link.
def download_chapters(self, save_path, chapter_count):
    """
    save_path      full path of the output file, e.g. d:\\123.txt
    chapter_count  number of chapters to save, e.g. 6 to download six chapters
    """
    next_chapter_url = None
    # Output file
    save_file = XFile(save_path, 'w+')
    for index in range(chapter_count):
        try:
            if next_chapter_url is None:
                # First iteration: start from the configured first chapter
                next_chapter_url = self.get_one_chapter(self._first_chapter_url, save_file)
            else:
                # get_one_chapter returns a relative path, so prepend the root URL
                new_url = self._root_url + next_chapter_url
                next_chapter_url = self.get_one_chapter(new_url, save_file)
            time.sleep(0.5)  # brief pause between requests
        except Exception as e:
            print(f"Error: {e}")
            break
    # Close the file properly so buffered content is flushed
    save_file.close()
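One optional hardening, sketched here as an assumption rather than part of the original code: since the loop breaks on the first error, buffered text only reaches disk when close() runs. You could add a flush method to XFile (the appendix version has none) and call it after each chapter:

# Hypothetical addition to XFile, delegating to the wrapped file object
def flush(self):
    self.file_io.flush()

# ...then inside the download loop, right after time.sleep(0.5):
save_file.flush()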
The complete implementation class
class BiQuGeNovel:
    """
    Created by cjb
    Scrapes novels from a certain biquge-style site
    """

    def __init__(self, root_url, first_chapter_url):
        self._root_url = root_url
        self._first_chapter_url = root_url + first_chapter_url

    @staticmethod
    def get_response(html_url):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
        }
        response = requests.get(url=html_url, headers=headers)
        response.encoding = response.apparent_encoding
        return response

    def download_chapters(self, save_path, chapter_count):
        """
        save_path      full path of the output file, e.g. d:\\123.txt
        chapter_count  number of chapters to save, e.g. 6 to download six chapters
        """
        next_chapter_url = None
        # Output file
        save_file = XFile(save_path, 'w+')
        for index in range(chapter_count):
            try:
                if next_chapter_url is None:
                    next_chapter_url = self.get_one_chapter(self._first_chapter_url, save_file)
                else:
                    new_url = self._root_url + next_chapter_url
                    next_chapter_url = self.get_one_chapter(new_url, save_file)
                time.sleep(0.5)  # brief pause between requests
            except Exception as e:
                print(f"Error: {e}")
                break
        # Close the file properly so buffered content is flushed
        save_file.close()

    def get_one_chapter(self, chapter_url, file):
        # Fetch the page
        response = self.get_response(chapter_url)
        # Load the page content into a BeautifulSoup parser
        soup = BeautifulSoup(response.text, 'html.parser')
        # Chapter title
        chapter_name = soup.find('div', {'class': 'bookname'}).find('h1').text
        print(f'Current chapter: {chapter_name}')
        file.write(f"{chapter_name}\n")
        # Link to the next chapter
        next_chapter = soup.find('div', {'class': 'bookname'}).find('a', {'class': 'next'})
        print(f"Next chapter: {next_chapter['href']}")
        # Chapter body text
        content_lines = soup.find('div', {'id': 'content'}).find_all('p', {'class': 'content_detail'})
        for line in content_lines:
            new_txt = line.text.replace("\r", "").replace("\n", "").replace("\t", "").strip()
            file.write(f"{new_txt}\n")
        file.write("\n\n")
        # Return the path of the next chapter. Only a relative path is returned
        # ('/book/66666/77777123.html'), so the caller must build the full URL.
        return next_chapter['href']

    def test_chapter(self):
        # Fetch the page once and save it to a file so it can be re-analyzed offline
        # response = self.get_response(self._first_chapter_url)
        # html_file = XFile('d:\\1a.txt', 'w')
        # html_file.write(response.text)
        # html_file.close()
        read_file = XFile('d:\\1a.txt', 'r')
        text = read_file.read()
        read_file.close()
        # Parse the saved page content
        soup = BeautifulSoup(text, 'html.parser')
        # Extract the title, the next-chapter link, and the body text
        book_name = soup.find('div', {'class': 'bookname'}).find('h1').text
        print(f'Title: {book_name}')
        next_chapter = soup.find('div', {'class': 'bookname'}).find('a', {'class': 'next'})
        print(f"Next chapter: {next_chapter['href']}")
        content_lines = soup.find('div', {'id': 'content'}).find_all('p', {'class': 'content_detail'})
        for line in content_lines:
            new_txt = line.text.replace("\r", "").replace("\n", "").replace("\t", "").strip()
            print(f'line: "{new_txt}"')
Usage
The address below is made up; search for a real site yourself when you actually use this.
# Pass in the URL of the first chapter; each next chapter is discovered from the page content and downloaded automatically
novel = BiQuGeNovel('https://www.aaaaa.vip', '/book/66666/77777660.html')
novel.download_chapters("d:\\ddd.txt", 2)  # download two chapters
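A side note on grabbing a whole book: when the final chapter has no next link, next_chapter['href'] raises an exception, which download_chapters catches before breaking out of the loop. Passing a deliberately oversized chapter_count is therefore a crude way to download everything, assuming the target site's last chapter really does lack a next anchor (the file name here is made up):

novel.download_chapters("d:\\ddd_full.txt", 100000)  # stops when the next link disappears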
Appendix: file reading and writing
import os
import shutil


class XFile:
    """
    Created by cjb for reading / writing files
    read          ====> r, rb
    write         ====> w, wb
    read & write  ====> r+, rb+, w+, wb+
    append        ====> a, a+, ab, ab+
    """

    def __init__(self, file_name, mode, encoding='utf-8'):
        self.file_name = file_name
        # Binary modes do not accept an encoding argument
        if 'b' in mode:
            self.file_io = open(file=self.file_name, mode=mode)
        else:
            self.file_io = open(file=self.file_name, mode=mode, encoding=encoding)

    def __del__(self):
        # Guard against open() having failed in __init__, in which case
        # file_io was never set; closing an already-closed file is a no-op
        if hasattr(self, 'file_io'):
            self.close()

    def write(self, text):
        self.file_io.write(text)

    def read(self):
        return self.file_io.read()

    def close(self):
        self.file_io.close()

    @staticmethod
    def cur_path(end_with_sep=False):
        """
        Return the current working directory.
        @param end_with_sep: whether to append a trailing path separator
        """
        if end_with_sep:
            return os.getcwd() + os.sep
        else:
            return os.getcwd()

    @staticmethod
    def rename(old_name, new_name):
        os.rename(old_name, new_name)

    @staticmethod
    def copy(src, dest):
        """
        Copy a single file
        """
        shutil.copy(src, dest)

    @staticmethod
    def copy_tree(src, dest):
        """
        Copy a whole directory tree, including all files
        """
        shutil.copytree(src, dest)

    @staticmethod
    def move(src, dest):
        """
        Move a single file. Within the same directory this is a rename;
        an existing file with the same name at the destination is overwritten.
        """
        shutil.move(src, dest)

    @staticmethod
    def mkdir(path):
        """
        Create a directory, including any missing intermediate directories
        """
        os.makedirs(path)

    @staticmethod
    def get_dir(file_path):
        """
        Return the directory part of a full file path
        """
        return os.path.dirname(file_path)

    @staticmethod
    def get_file(file_path):
        """
        Return the file-name part of a full file path
        """
        return os.path.basename(file_path)

    @staticmethod
    def rm(path):
        """
        Remove a single file (os.remove cannot delete directories;
        use rm_tree for those)
        """
        os.remove(path)

    @staticmethod
    def rm_tree(path):
        """
        Remove a whole directory tree, including all files and subdirectories
        """
        shutil.rmtree(path)

    @staticmethod
    def scan_files(path, suffix=None):
        """
        Recursively collect all files under a directory, optionally filtering
        by extension. Returns a list of full paths.
        suffix includes the dot, e.g. '.jpg'
        """
        result = []
        for root, dirs, files in os.walk(path):
            for file in files:
                if suffix is not None:
                    if file.endswith(suffix):
                        result.append(os.path.join(root, file))
                else:
                    result.append(os.path.join(root, file))
        return result
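To close, a couple of quick usage sketches for XFile; the paths are invented for illustration:

# Recursively list every .txt file under a directory
for path in XFile.scan_files('d:\\novels', '.txt'):
    print(path)

# Write a small file, then read it back
out_file = XFile('d:\\demo.txt', 'w')
out_file.write('hello\n')
out_file.close()
in_file = XFile('d:\\demo.txt', 'r')
print(in_file.read())
in_file.close()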