I want to parse a specific URL, extract its content, and save it to local files, implemented in Python.
step1: fetch the novel content and save it to local files
import os
import requests
from bs4 import BeautifulSoup
def generate_filename(index):
    """Generate a two-letter file name, in aa, ab, ..., zz order."""
    if index < 0 or index >= 26 * 26:
        raise ValueError("Index out of range (0-675)")
    first = index // 26
    second = index % 26
    return f"{chr(97 + first)}{chr(97 + second)}.txt"
def save_novel_content(start_page, end_page, novel_id, save_dir):
    """
    Save the novel content to local files.
    Parameters:
        start_page: first page number
        end_page: last page number (inclusive)
        novel_id: novel ID (e.g. 1479359 in the example)
        save_dir: directory to save into
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    for i in range(start_page, end_page + 1):
        url = f"https://wap.faloo.com/{novel_id}_{i}.html"
        try:
            # Fetch the page (the timeout keeps one stalled request from hanging the loop)
            response = requests.get(url, headers=headers, timeout=10)
            response.encoding = 'gbk'
            soup = BeautifulSoup(response.text, 'html.parser')
            # Extract the raw HTML of the chapter block (tags are kept for now)
            content_div = soup.find('div', class_='nodeContent')
            content = str(content_div) if content_div else ""
            # Build the file name and save
            filename = generate_filename(i - start_page)
            filepath = os.path.join(save_dir, filename)
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(content)
            print(f"Saved page {i} to: {filepath}")
        except Exception as e:
            print(f"Error while processing page {i}: {e}")
# Usage example
if __name__ == "__main__":
    save_path = r"C:\Users\wangrusheng\Downloads\dce"
    novel_id = 1479359
    start_page = 1
    end_page = 20
    save_novel_content(
        start_page=start_page,
        end_page=end_page,
        novel_id=novel_id,
        save_dir=save_path
    )
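The loop above fires one request per page with no pause or retry, so a slow or flaky connection simply skips the page. If that becomes a problem, a small helper like the sketch below could stand in for the plain requests.get call; fetch_page, retries and delay are hypothetical names and values, not part of the original script.
import time
import requests

def fetch_page(url, headers, retries=3, delay=1.0):
    # Try the request a few times, pausing between attempts; re-raise on the last failure.
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            return response
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(delay)
The pause between attempts also keeps the crawler polite toward the site.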
step2: merge the content into a single file
import os
import re
def merge_txt_files(input_dir, output_file):
    """
    Merge all txt files in a directory whose names follow the aa, ab, ..., zz
    pattern into one output file, in alphabetical order.
    :param input_dir: input directory path
    :param output_file: path of the merged output file
    :return: number of files merged
    """
    # List every file in the directory
    all_files = os.listdir(input_dir)
    # Match two-letter txt file names (case-insensitive)
    pattern = re.compile(r'^[a-zA-Z]{2}\.txt$', re.IGNORECASE)
    matched_files = []
    for filename in all_files:
        if pattern.match(filename):
            basename = os.path.splitext(filename)[0].lower()
            if len(basename) == 2 and basename.isalpha():
                matched_files.append((basename, filename))
    # Sort alphabetically, which restores the original page order
    matched_files.sort(key=lambda x: x[0])
    # Merge the files
    with open(output_file, 'w', encoding='utf-8') as outfile:
        for basename, filename in matched_files:
            filepath = os.path.join(input_dir, filename)
            try:
                with open(filepath, 'r', encoding='utf-8') as infile:
                    outfile.write(infile.read())
                # Optional: add a newline after each file's content
                # outfile.write('\n')
            except Exception as e:
                print(f"Error while processing file {filename}: {e}")
    return len(matched_files)
# Example usage (merged.txt is safe to place in the same directory: the two-letter pattern never matches it)
input_dir = r'C:\Users\wangrusheng\Downloads\dce'
output_file = r'C:\Users\wangrusheng\Downloads\dce\merged.txt'
count = merge_txt_files(input_dir, output_file)
print(f"Merged {count} files successfully.")
step3: strip the HTML tags
import re
def remove_html_tags(file_path):
    # Read the file content
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    # Strip HTML tags with a regular expression
    cleaned_content = re.sub(r'<[^>]+>', '', content)
    # Overwrite the original file (change the output path if you want to keep the original)
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(cleaned_content)
# Usage example (replace the path with your actual path)
remove_html_tags(r'C:\Users\wangrusheng\Downloads\dce\merged.txt')
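The regex removes tags but leaves HTML entities such as &nbsp; in place and can trip over a stray < in the text. Since BeautifulSoup is already installed for step 1, an alternative sketch that also decodes entities (remove_html_tags_bs4 is a hypothetical drop-in replacement):
from bs4 import BeautifulSoup

def remove_html_tags_bs4(file_path):
    # Parse the file as HTML and keep only the visible text, decoding entities as well.
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    cleaned_content = BeautifulSoup(content, 'html.parser').get_text()
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(cleaned_content)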
step4: split the file
import os
def split_file_by_char(input_file_path, chunk_size=2050):
    # Read the original file content
    with open(input_file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    # Split it into chunks of the given size
    chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]
    # Make sure the number of files fits the two-letter naming scheme
    if len(chunks) > 26 * 26:
        raise ValueError(f"Number of output files exceeds the limit (676); {len(chunks)} files would be needed")
    # Generate the file names and save the chunks
    base_dir = os.path.dirname(input_file_path)
    for index, chunk in enumerate(chunks):
        # Work out the file name for this index
        first_char = chr(97 + index // 26)  # first letter
        second_char = chr(97 + index % 26)  # second letter
        filename = f"{first_char}{second_char}.txt"
        # Write the chunk
        output_path = os.path.join(base_dir, filename)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(chunk)
    return len(chunks)
# Usage example
split_file_by_char(r'C:\Users\wangrusheng\Downloads\zb\merged.txt')
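Chained together, the four steps form one pipeline. The sketch below assumes all of the functions above live in one script and reuses the paths from the examples; note that step 4 writes its aa.txt, ab.txt, ... chunks next to merged.txt, so point it at a fresh directory (as the zb path in the original example suggests) if you want to keep the pages downloaded in step 1 intact.
import os

if __name__ == "__main__":
    work_dir = r"C:\Users\wangrusheng\Downloads\dce"
    merged = os.path.join(work_dir, "merged.txt")
    save_novel_content(start_page=1, end_page=20, novel_id=1479359, save_dir=work_dir)  # step 1
    count = merge_txt_files(work_dir, merged)  # step 2
    print(f"Merged {count} files.")
    remove_html_tags(merged)  # step 3
    chunks = split_file_by_char(merged)  # step 4
    print(f"Split into {chunks} chunk files.")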
end