I want to parse a specific URL, extract its content, and save it to local files, implemented in Python.
step1: fetch the novel content and save it to local files
import os
import requests
from bs4 import BeautifulSoup
def generate_filename(index):
    """Generate a two-letter file name, in aa, ab, ..., zz order."""
    if index < 0 or index >= 26 * 26:
        raise ValueError("Index out of range (0-675)")
    first = index // 26
    second = index % 26
    return f"{chr(97 + first)}{chr(97 + second)}.txt"
def save_novel_content(start_page, end_page, novel_id, save_dir):
    """
    Save the novel content to local files.
    Parameters:
        start_page: first page number
        end_page: last page number (inclusive)
        novel_id: novel ID (e.g. 1479359 in the example)
        save_dir: directory to save into
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    for i in range(start_page, end_page + 1):
        url = f"https://wap.faloo.com/{novel_id}_{i}.html"
        try:
            # Fetch the page (the timeout keeps one stalled request from hanging the loop)
            response = requests.get(url, headers=headers, timeout=10)
            response.encoding = 'gbk'
            soup = BeautifulSoup(response.text, 'html.parser')
            # Extract the raw HTML of the chapter block (tags are kept for now)
            content_div = soup.find('div', class_='nodeContent')
            content = str(content_div) if content_div else ""
            # Build the file name and save
            filename = generate_filename(i - start_page)
            filepath = os.path.join(save_dir, filename)
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(content)
            print(f"Saved page {i} to: {filepath}")
        except Exception as e:
            print(f"Error while processing page {i}: {e}")
# Usage example
if __name__ == "__main__":
    save_path = r"C:\Users\wangrusheng\Downloads\dce"
    novel_id = 1479359
    start_page = 1
    end_page = 20
    save_novel_content(
        start_page=start_page,
        end_page=end_page,
        novel_id=novel_id,
        save_dir=save_path
    )
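The loop above fires one request per page with no pause or retry, so a slow or flaky connection simply skips the page. If that becomes a problem, a small helper like the sketch below could stand in for the plain requests.get call; fetch_page, retries and delay are hypothetical names and values, not part of the original script.
import time
import requests

def fetch_page(url, headers, retries=3, delay=1.0):
    # Try the request a few times, pausing between attempts; re-raise on the last failure.
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            return response
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(delay)
The pause between attempts also keeps the crawler polite toward the site.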
step2: merge the content into a single file
import os
import re
def merge_txt_files(input_dir, output_file):
    """
    Merge all txt files in a directory whose names follow the aa, ab, ..., zz
    pattern into one output file, in alphabetical order.
    :param input_dir: input directory path
    :param output_file: path of the merged output file
    :return: number of files merged
    """
    # List every file in the directory
    all_files = os.listdir(input_dir)
    # Match two-letter txt file names (case-insensitive)
    pattern = re.compile(r'^[a-zA-Z]{2}\.txt$', re.IGNORECASE)
    matched_files = []
    for filename in all_files:
        if pattern.match(filename):
            basename = os.path.splitext(filename)[0].lower()
            if len(basename) == 2 and basename.isalpha():
                matched_files.append((basename, filename))
    # Sort alphabetically, which restores the original page order
    matched_files.sort(key=lambda x: x[0])
    # Merge the files
    with open(output_file, 'w', encoding='utf-8') as outfile:
        for basename, filename in matched_files:
            filepath = os.path.join(input_dir, filename)
            try:
                with open(filepath, 'r', encoding='utf-8') as infile:
                    outfile.write(infile.read())
                # Optional: add a newline after each file's content
                # outfile.write('\n')
            except Exception as e:
                print(f"Error while processing file {filename}: {e}")
    return len(matched_files)
# Example usage (merged.txt is safe to place in the same directory: the two-letter pattern never matches it)
input_dir = r'C:\Users\wangrusheng\Downloads\dce'
output_file = r'C:\Users\wangrusheng\Downloads\dce\merged.txt'
count = merge_txt_files(input_dir, output_file)
print(f"Merged {count} files successfully.")
step3: strip the HTML tags
import re
def remove_html_tags(file_path):
    # Read the file content
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    # Strip HTML tags with a regular expression
    cleaned_content = re.sub(r'<[^>]+>', '', content)
    # Overwrite the original file (change the output path if you want to keep the original)
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(cleaned_content)
# Usage example (replace the path with your actual path)
remove_html_tags(r'C:\Users\wangrusheng\Downloads\dce\merged.txt')
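The regex removes tags but leaves HTML entities such as &nbsp; in place and can trip over a stray < in the text. Since BeautifulSoup is already installed for step 1, an alternative sketch that also decodes entities (remove_html_tags_bs4 is a hypothetical drop-in replacement):
from bs4 import BeautifulSoup

def remove_html_tags_bs4(file_path):
    # Parse the file as HTML and keep only the visible text, decoding entities as well.
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    cleaned_content = BeautifulSoup(content, 'html.parser').get_text()
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(cleaned_content)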
step4: split the file
import os
def split_file_by_char(input_file_path, chunk_size=2050):
    # Read the original file content
    with open(input_file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    # Split it into chunks of the given size
    chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]
    # Make sure the number of files fits the two-letter naming scheme
    if len(chunks) > 26 * 26:
        raise ValueError(f"Number of output files exceeds the limit (676); {len(chunks)} files would be needed")
    # Generate the file names and save the chunks
    base_dir = os.path.dirname(input_file_path)
    for index, chunk in enumerate(chunks):
        # Work out the file name for this index
        first_char = chr(97 + index // 26)  # first letter
        second_char = chr(97 + index % 26)  # second letter
        filename = f"{first_char}{second_char}.txt"
        # Write the chunk
        output_path = os.path.join(base_dir, filename)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(chunk)
    return len(chunks)
# Usage example
split_file_by_char(r'C:\Users\wangrusheng\Downloads\zb\merged.txt')
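Chained together, the four steps form one pipeline. The sketch below assumes all of the functions above live in one script and reuses the paths from the examples; note that step 4 writes its aa.txt, ab.txt, ... chunks next to merged.txt, so point it at a fresh directory (as the zb path in the original example suggests) if you want to keep the pages downloaded in step 1 intact.
import os

if __name__ == "__main__":
    work_dir = r"C:\Users\wangrusheng\Downloads\dce"
    merged = os.path.join(work_dir, "merged.txt")
    save_novel_content(start_page=1, end_page=20, novel_id=1479359, save_dir=work_dir)  # step 1
    count = merge_txt_files(work_dir, merged)  # step 2
    print(f"Merged {count} files.")
    remove_html_tags(merged)  # step 3
    chunks = split_file_by_char(merged)  # step 4
    print(f"Split into {chunks} chunk files.")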
end