python代码解析｜多个txt文本如何分类整合

八八嘿嘿

已于 2024-03-28 21:14:26 修改

阅读量703

点赞数 14

分类专栏： python文件处理文章标签： python java linux

于 2024-03-28 21:13:23 首次发布

本文链接：https://blog.csdn.net/weixin_50381726/article/details/137114093

版权

python文件处理专栏收录该内容

5 篇文章 0 订阅

订阅专栏

上一篇文章介绍了如何提取文本并分别存入 txt 文件；那么，对于年报这类结构相似的文本，又该如何整合、处理和分析呢？

1、提取单个文件中指定字符串之间的文本并保存到文件夹（代码会遍历文件夹内所有 txt 文件，但输出文件统一以第一个字符串命名，因此适用于文件夹中只有一个待处理文件的情况）

import os

def extract_content_between_strings(folder_path, output_folder_path, first_string, second_string):
    """Save the text found between two marker strings in each ``.txt`` file.

    Every entry of ``folder_path`` whose name ends with ``.txt`` is read as
    UTF-8. When a file contains ``first_string`` followed later by
    ``second_string``, the text between the first such pair (markers
    excluded) is written to ``<first_string>.txt`` in ``output_folder_path``.

    NOTE: the output name depends only on ``first_string``, so when several
    input files contain both markers each one overwrites the previous output.

    Args:
        folder_path: Folder that holds the input ``.txt`` files.
        output_folder_path: Folder that receives the output file; created
            when it does not exist.
        first_string: Marker that opens the section to extract.
        second_string: Marker that closes the section to extract.
    """
    # Create the output folder when it is missing.
    if not os.path.exists(output_folder_path):
        os.makedirs(output_folder_path)

    for entry in os.listdir(folder_path):
        if not entry.endswith('.txt'):
            continue

        with open(os.path.join(folder_path, entry), 'r', encoding='utf-8') as source:
            text = source.read()

        # Locate the opening marker; files without it are skipped.
        opening = text.find(first_string)
        if opening == -1:
            continue

        # The closing marker is only searched for after the opening one.
        body_start = opening + len(first_string)
        closing = text.find(second_string, body_start)
        if closing == -1:
            continue

        # The output file is named after the opening marker.
        target_name = first_string + ".txt"
        target_path = os.path.join(output_folder_path, target_name)
        with open(target_path, 'w', encoding='utf-8') as target:
            target.write(text[body_start:closing])
            print(f"Content extracted and saved to {target_name} in {output_folder_path}")

# Configure the input folder, the output folder and the two marker strings.
folder_path = r'C:\Users\一目\Desktop\课程\案例大赛\华为文件实验1'  # replace with your input folder path
output_folder_path = r'C:\Users\一目\Desktop\课程\案例大赛'  # replace with the folder where extracted content should be saved
# NOTE: the trailing space inside the literal below is part of the marker that is searched for.
first_string = "利率风险 "  # replace with the first (opening) marker string
second_string = "流动性风险"  # replace with the second (closing) marker string

# Run the extraction.
extract_content_between_strings(folder_path, output_folder_path, first_string, second_string)

2、提取多个文件中的特定字符串之间的文本并保存到文件夹

import os

def _find_sections(content, first_string, second_string):
    """Return every piece of ``content`` enclosed by the two markers, in order."""
    sections = []
    start_index = content.find(first_string)
    while start_index != -1:
        section_start = start_index + len(first_string)
        end_index = content.find(second_string, section_start)
        if end_index == -1:
            # No closing marker remains, so no further section can be complete.
            # Stopping here also prevents the search from restarting near the
            # beginning of the text and looping forever.
            break
        sections.append(content[section_start:end_index])
        # Resume after the closing marker; always move forward by at least one
        # character so that empty markers cannot stall the scan.
        resume_at = max(end_index + len(second_string), start_index + 1)
        start_index = content.find(first_string, resume_at)
    return sections


def extract_and_save_content(folder_path, output_folder_path, first_string, second_string):
    """Extract all marker-delimited sections from each ``.txt`` file.

    Every entry of ``folder_path`` whose name ends with ``.txt`` is read as
    UTF-8 and scanned for sections that start after ``first_string`` and end
    before the next ``second_string``. For each file with at least one
    complete section, the sections are written to
    ``<first_string>_<original file name>`` in ``output_folder_path``; when a
    file contains several sections they are joined with a newline so that
    none of them is lost.

    Args:
        folder_path: Folder that holds the input ``.txt`` files.
        output_folder_path: Folder that receives the output files; created
            when it does not exist.
        first_string: Marker that opens a section.
        second_string: Marker that closes a section.
    """
    # Make sure the output folder exists.
    os.makedirs(output_folder_path, exist_ok=True)

    for filename in os.listdir(folder_path):
        if not filename.endswith('.txt'):
            continue

        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        sections = _find_sections(content, first_string, second_string)
        if not sections:
            continue

        # The output name combines the opening marker and the source file name.
        new_filename = f"{first_string}_{filename}"
        new_file_path = os.path.join(output_folder_path, new_filename)
        with open(new_file_path, 'w', encoding='utf-8') as new_file:
            new_file.write("\n".join(sections))
        print(f"Content extracted and saved to {new_filename} in {output_folder_path}")

# Configure the input folder, the output folder and the two marker strings.
folder_path = 'path/to/your/txt/folder'  # replace with your input folder path
output_folder_path = 'path/to/your/output/folder'  # replace with the folder where extracted content should be saved
first_string = "开始字符串"  # replace with the first (opening) marker string
second_string = "结束字符串"  # replace with the second (closing) marker string

# Run the extraction.
extract_and_save_content(folder_path, output_folder_path, first_string, second_string)

3、将多个文本文件合并成一个文件，并且在每个文件的内容前加上原文件的名称

import os

def merge_files_with_names(folder_path, output_file_path):
    """Merge all ``.txt`` files of a folder into one file, labelled by name.

    Files whose name ends with ``.txt`` are processed in sorted name order.
    For each one the output receives the file name on its own line, the file
    content, and a trailing newline as separator. Input and output are UTF-8.

    Args:
        folder_path: Folder that holds the input ``.txt`` files.
        output_file_path: Path of the merged file. Its parent folder is
            created when missing; a bare file name (no folder part) is
            written to the current working directory.
    """
    # os.path.dirname() is '' for a bare file name and os.makedirs('') raises,
    # so only create the folder when there is a folder part.
    output_dir = os.path.dirname(output_file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    output_abs = os.path.normcase(os.path.abspath(output_file_path))

    merged_content = []
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.txt'):
            continue

        file_path = os.path.join(folder_path, filename)
        # Skip the merged file itself: when it lives inside folder_path, a
        # re-run would otherwise fold the previous output into the new one.
        if os.path.normcase(os.path.abspath(file_path)) == output_abs:
            continue

        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # File name header, content, then a newline separating the entries.
        merged_content.append(f"{filename}\n")
        merged_content.append(content)
        merged_content.append("\n")

    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        output_file.write("".join(merged_content))
    print(f"Files merged and saved to {output_file_path}")

# Configure the input folder and the path of the merged output file.
folder_path = 'path/to/your/txt/folder'  # replace with your input folder path
output_file_path = 'path/to/your/output/merged_files.txt'  # replace with the path where the merged file should be saved

# Run the merge.
merge_files_with_names(folder_path, output_file_path)

其中，

with open(output_file_path, 'w', encoding='utf-8') as output_file:
- with 语句是Python的上下文管理器，它确保文件在使用后会被正确关闭，即使在写入文件时发生异常也是如此。
- open(output_file_path, 'w', encoding='utf-8') 函数用于打开一个文件。output_file_path 是新文件的完整路径，这个路径之前应该已经在脚本中定义。
- 'w' 模式表示写入模式。如果文件已经存在，这个模式会覆盖原有内容；如果文件不存在，会创建一个新文件。
- encoding='utf-8' 参数指定文件的编码格式为UTF-8，这是一种广泛使用的字符编码，可以表示多种语言的字符。
- as output_file 部分将打开的文件对象赋值给变量 output_file，以便在 with 语句块中使用。
output_file.write("".join(merged_content))
- output_file.write() 是一个方法，用于向文件中写入内容。
- "".join(merged_content) 是一个表达式，它将 merged_content 列表中的所有字符串连接成一个单一的字符串。merged_content 列表应该包含了所有合并后的文本内容，以及每个文件名和可能的分隔符。
- 这个连接后的字符串被写入到之前打开的 output_file 文件中。
print(f"Files merged and saved to {output_file_path}")
- 这行代码打印一条消息到控制台，通知用户文件合并操作已完成，并指出新文件保存的位置。
- f 前缀表示这是一个格式化字符串（也称为 f-string），它允许在字符串中嵌入表达式。
- {output_file_path} 是一个占位符，它在字符串中被变量 output_file_path 的值所替换。

总的来说，这段代码的作用是将多个文本文件的内容合并成一个单一的字符串，然后将这个字符串写入到一个新的文件中，并在控制台输出一条完成消息。这是文件合并操作的最后一步，将所有合并后的数据永久保存到磁盘上。

4、整合代码

遍历指定文件夹中的所有 .txt 文件。
从每个文件中提取两个指定字符串之间的内容。
将提取的内容与原文件名合并，形成新的文件名。
将所有提取的内容合并到一个单独的文件中，并在每个内容前加上相应的文件名。

import os
import re

def extract_content_between_strings(content, first_string, second_string):
    """Return the text between two markers, or ``None`` when a marker is missing.

    The section starts right after the first occurrence of ``first_string``
    and ends right before the next occurrence of ``second_string`` that
    follows it; neither marker is included in the result.

    Args:
        content: Text to search.
        first_string: Marker that opens the section.
        second_string: Marker that closes the section.

    Returns:
        The enclosed text (possibly empty), or ``None`` when ``first_string``
        is absent or no ``second_string`` follows it.
    """
    opening = content.find(first_string)
    if opening == -1:
        return None

    # Only look for the closing marker after the end of the opening one.
    body_start = opening + len(first_string)
    closing = content.find(second_string, body_start)
    return None if closing == -1 else content[body_start:closing]

def merge_files_with_names(folder_path, output_folder_path, first_string, second_string):
    """Collect the marker-delimited section of each ``.txt`` file into one file.

    Files of ``folder_path`` whose name ends with ``.txt`` are read as UTF-8
    in sorted name order. For every file where
    ``extract_content_between_strings`` finds a section, the output receives
    the file name on its own line, the section, and a blank-line separator.
    The result is written to ``merged_files.txt`` in ``output_folder_path``.

    Args:
        folder_path: Folder that holds the input ``.txt`` files.
        output_folder_path: Folder that receives ``merged_files.txt``;
            created when it does not exist.
        first_string: Marker that opens the section to extract.
        second_string: Marker that closes the section to extract.
    """
    # Create the output folder when it is missing.
    if not os.path.exists(output_folder_path):
        os.makedirs(output_folder_path)

    pieces = []
    txt_names = [name for name in sorted(os.listdir(folder_path)) if name.endswith('.txt')]
    for name in txt_names:
        with open(os.path.join(folder_path, name), 'r', encoding='utf-8') as source:
            text = source.read()

        section = extract_content_between_strings(text, first_string, second_string)
        if section is None:
            # Files without a complete section contribute nothing.
            continue

        # File name header, extracted section, then a blank-line separator.
        pieces.extend((f"{name}\n", section, "\n\n"))

    target_path = os.path.join(output_folder_path, "merged_files.txt")
    with open(target_path, 'w', encoding='utf-8') as target:
        target.write("".join(pieces))
    print(f"Files merged and saved to {target_path}")

# Configure the input folder, the output folder and the two marker strings.
folder_path = 'path/to/your/txt/folder'  # replace with your input folder path
output_folder_path = 'path/to/your/output/folder'  # replace with the folder where the merged file should be saved
first_string = "开始字符串"  # replace with the first (opening) marker string
second_string = "结束字符串"  # replace with the second (closing) marker string

# Run the extraction and merge.
merge_files_with_names(folder_path, output_folder_path, first_string, second_string)

将所有指标按类储存到各自文档中

def extract_and_save_content_for_string_pairs(folder_path, output_folder_path, string_list):
    """Write one report per pair of adjacent markers in ``string_list``.

    Each entry of ``string_list`` is paired with the entry that follows it.
    For every pair ``(s1, s2)``, all ``.txt`` files of ``folder_path`` are
    read as UTF-8 in sorted name order and the section found by
    ``extract_content_between_strings`` is collected. The collected sections
    are written to ``<s1>_<s2>.txt`` in ``output_folder_path``, each preceded
    by a ``<file name> - <s1>:`` header and followed by a blank line. The
    report file is written for every pair, even when no file matched.

    Args:
        folder_path: Folder that holds the input ``.txt`` files.
        output_folder_path: Folder that receives the report files; created
            when it does not exist.
        string_list: Ordered marker strings; fewer than two entries produce
            no pairs and therefore no output files.
    """
    # Create the output folder when it is missing.
    if not os.path.exists(output_folder_path):
        os.makedirs(output_folder_path)

    # Adjacent entries form the (opening, closing) marker pairs.
    marker_pairs = list(zip(string_list, string_list[1:]))

    for s1, s2 in marker_pairs:
        # Maps each source file name to the section extracted from it.
        sections = {}
        for name in sorted(os.listdir(folder_path)):
            if not name.endswith('.txt'):
                continue
            with open(os.path.join(folder_path, name), 'r', encoding='utf-8') as source:
                section = extract_content_between_strings(source.read(), s1, s2)
                if section is not None:
                    sections[name] = section

        # One report file per marker pair.
        output_filename = f"{s1}_{s2}.txt"
        report_path = os.path.join(output_folder_path, output_filename)
        with open(report_path, 'w', encoding='utf-8') as report:
            for name, section in sections.items():
                report.write(f"{name} - {s1}:\n")
                report.write(section)
                report.write("\n\n")
        print(f"Content for {s1} and {s2} extracted and saved to {output_filename}")

# Configure the input folder, the output folder and the ordered marker list.
folder_path = 'path/to/your/txt/folder'  # replace with your input folder path
output_folder_path = 'path/to/your/output/folder'  # replace with the folder where extracted content should be saved
# Placeholder value: the function pairs each entry with the next one, so at
# least two marker strings are needed before any pair (and output) is produced.
string_list = ['XXXXXXXX']

# Run the extraction for every pair of adjacent markers.
extract_and_save_content_for_string_pairs(folder_path, output_folder_path, string_list)

八八嘿嘿

关注

14
点赞
踩
15

收藏

觉得还不错? 一键收藏
1
评论
python代码解析｜多个txt文本如何分类整合

总的来说，这段代码的作用是将多个文本文件的内容合并成一个单一的字符串，然后将这个字符串写入到一个新的文件中，并在控制台输出一条完成消息。这是文件合并操作的最后一步，将所有合并后的数据永久保存到磁盘上。
复制链接

扫一扫

专栏目录