python代码解析｜多个txt文本如何分类整合

2401_84010702

于 2024-04-12 14:59:21 发布

阅读量386

点赞数 3

分类专栏：程序员 文章标签：python 数据库 开发语言

本文链接：https://blog.csdn.net/2401_84010702/article/details/137681354

版权

程序员专栏收录该内容

237 篇文章 0 订阅

订阅专栏

import os

def extract_content_between_strings(folder_path, output_folder_path, first_string, second_string):
    """Extract the text between two markers from each ``.txt`` file in a folder.

    For every file in ``folder_path`` whose name ends with ``.txt``, the first
    occurrence of ``first_string`` is located, followed by the first occurrence
    of ``second_string`` after it. The text strictly between the two markers
    is written to ``<first_string>.txt`` inside ``output_folder_path``.

    Note: the output file name depends only on ``first_string``, so when more
    than one input file contains both markers, each match overwrites the
    previous one and only the last processed file's extract remains.

    Args:
        folder_path: Directory containing the input ``.txt`` files.
        output_folder_path: Directory for the result; created if missing.
        first_string: Start marker (excluded from the extracted text).
        second_string: End marker (excluded from the extracted text).
    """
    # Make sure the output folder exists.
    os.makedirs(output_folder_path, exist_ok=True)

    # Walk over every entry in the input folder.
    for filename in os.listdir(folder_path):
        if not filename.endswith('.txt'):
            continue

        file_path = os.path.join(folder_path, filename)
        # Read the whole file as UTF-8 text.
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # Locate the start marker; skip files that do not contain it.
        start_index = content.find(first_string)
        if start_index == -1:
            continue

        # The end marker is only searched for after the start marker.
        body_start = start_index + len(first_string)
        end_index = content.find(second_string, body_start)
        if end_index == -1:
            continue

        extracted_content = content[body_start:end_index]
        # The start marker doubles as the output file name.
        output_filename = first_string + ".txt"
        output_file_path = os.path.join(output_folder_path, output_filename)

        # Save the extracted text to the output file.
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            output_file.write(extracted_content)
        print(f"Content extracted and saved to {output_filename} in {output_folder_path}")

# Set the folder paths

folder_path = r'C:\Users\一目\Desktop\课程\案例大赛\华为文件实验1'  # replace with your input folder path
output_folder_path = r'C:\Users\一目\Desktop\课程\案例大赛'  # replace with the folder where extracts should be saved
first_string = "利率风险 "  # replace with the first marker string
second_string = "流动性风险"  # replace with the second marker string

# Call the function

extract_content_between_strings(folder_path, output_folder_path, first_string, second_string)


![](https://img-blog.csdnimg.cn/direct/f6db834d400a43f8bb01d4ea73e0077c.png)


## 2、提取多个文件中的特定字符串之间的文本并保存到文件夹

import os

def extract_and_save_content(folder_path, output_folder_path, first_string, second_string):
    """Extract every marker-delimited passage from each ``.txt`` file in a folder.

    For every file in ``folder_path`` whose name ends with ``.txt``, all
    non-overlapping passages that start after ``first_string`` and end before
    the next ``second_string`` are collected. If at least one passage is
    found, the passages are written, separated by a newline, to
    ``<first_string>_<original file name>`` inside ``output_folder_path``.

    A start marker without a following end marker ends the scan of that file.

    Args:
        folder_path: Directory containing the input ``.txt`` files.
        output_folder_path: Directory for the results; created if missing.
        first_string: Start marker (excluded from the extracted text).
        second_string: End marker (excluded from the extracted text).
    """
    # Make sure the output folder exists.
    os.makedirs(output_folder_path, exist_ok=True)

    # Walk over every entry in the input folder.
    for filename in os.listdir(folder_path):
        if not filename.endswith('.txt'):
            continue

        file_path = os.path.join(folder_path, filename)
        # Read the whole file as UTF-8 text.
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # Collect every passage first so that later matches do not overwrite
        # earlier ones when the file is written.
        segments = []
        start_index = content.find(first_string)
        while start_index != -1:
            body_start = start_index + len(first_string)
            end_index = content.find(second_string, body_start)
            if end_index == -1:
                # No closing marker: stop here. Searching again from an offset
                # derived from -1 could find the same start marker forever.
                break
            segments.append(content[body_start:end_index])
            # Continue after the end marker; always advance by at least one
            # character so that empty markers cannot stall the loop.
            next_from = max(end_index + len(second_string), start_index + 1)
            start_index = content.find(first_string, next_from)

        if not segments:
            continue

        # The output name combines the start marker and the source file name.
        new_filename = f"{first_string}_{filename}"
        new_file_path = os.path.join(output_folder_path, new_filename)

        # Save all extracted passages to the new file.
        with open(new_file_path, 'w', encoding='utf-8') as new_file:
            new_file.write("\n".join(segments))
        print(f"Content extracted and saved to {new_filename} in {output_folder_path}")

# Set the folder paths

folder_path = 'path/to/your/txt/folder'  # replace with your input folder path
output_folder_path = 'path/to/your/output/folder'  # replace with the folder where extracts should be saved
first_string = "开始字符串"  # replace with the first marker string
second_string = "结束字符串"  # replace with the second marker string

# Call the function

extract_and_save_content(folder_path, output_folder_path, first_string, second_string)


![](https://img-blog.csdnimg.cn/direct/c8ec058e400f4420b821af48839f5fdc.png)


## 3、将多个文本文件合并成一个文件，并且在每个文件的内容前加上原文件的名称

import os

def merge_files_with_names(folder_path, output_file_path):
    """Merge all ``.txt`` files of a folder into one file, labelled by file name.

    Files are processed in sorted name order. For each file the output
    receives the file name on its own line, then the file's content, then a
    newline as separator.

    Args:
        folder_path: Directory containing the input ``.txt`` files.
        output_file_path: Path of the merged file. Its parent directory is
            created if missing. If this file lives inside ``folder_path`` it
            is not treated as an input.
    """
    # Make sure the output folder exists. A bare file name has an empty
    # directory part, and os.makedirs('') would raise.
    output_dir = os.path.dirname(output_file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    output_abs = os.path.abspath(output_file_path)

    # Pieces of the merged text, joined once at the end.
    merged_content = []

    # Walk over the folder in sorted order for a stable result.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.txt'):
            continue

        file_path = os.path.join(folder_path, filename)
        # Skip a previous merge result so re-runs do not merge it into itself.
        if os.path.abspath(file_path) == output_abs:
            continue

        # Read the whole file as UTF-8 text.
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # File name header, content, then a separating newline.
        merged_content.append(f"{filename}\n")
        merged_content.append(content)
        merged_content.append("\n")

    # Write the merged text to the output file.
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        output_file.write("".join(merged_content))
    print(f"Files merged and saved to {output_file_path}")

# Set the input folder path and the output file path

folder_path = 'path/to/your/txt/folder'  # replace with your input folder path
output_file_path = 'path/to/your/output/merged_files.txt'  # replace with the path where the merged file should be saved

# Call the function

merge_files_with_names(folder_path, output_file_path)


其中，


1. `with open(output_file_path, 'w', encoding='utf-8') as output_file:`


	* `with` 语句是Python的上下文管理器，它确保文件在使用后会被正确关闭，即使在写入文件时发生异常也是如此。
	* `open(output_file_path, 'w', encoding='utf-8')` 函数用于打开一个文件。`output_file_path` 是新文件的完整路径，这个路径之前应该已经在脚本中定义。
	* `'w'` 模式表示写入模式。如果文件已经存在，这个模式会覆盖原有内容；如果文件不存在，会创建一个新文件。
	* `encoding='utf-8'` 参数指定文件的编码格式为UTF-8，这是一种广泛使用的字符编码，可以表示多种语言的字符。
	* `as output_file` 部分将打开的文件对象赋值给变量 `output_file`，以便在 `with` 语句块中使用。
2. `output_file.write("".join(merged_content))`


	* `output_file.write()` 是一个方法，用于向文件中写入内容。
	* `"".join(merged_content)` 是一个表达式，它将 `merged_content` 列表中的所有字符串连接成一个单一的字符串。`merged_content` 列表应该包含了所有合并后的文本内容，以及每个文件名和可能的分隔符。
	* 这个连接后的字符串被写入到之前打开的 `output_file` 文件中。
3. `print(f"Files merged and saved to {output_file_path}")`


	* 这行代码打印一条消息到控制台，通知用户文件合并操作已完成，并指出新文件保存的位置。
	* `f` 前缀表示这是一个格式化字符串（也称为 f-string），它允许在字符串中嵌入表达式。
	* `{output_file_path}` 是一个占位符，它在字符串中被变量 `output_file_path` 的值所替换。


总的来说，这段代码的作用是将多个文本文件的内容合并成一个单一的字符串，然后将这个字符串写入到一个新的文件中，并在控制台输出一条完成消息。这是文件合并操作的最后一步，将所有合并后的数据永久保存到磁盘上。


## 4、整合代码


1. 遍历指定文件夹中的所有 `.txt` 文件。
2. 从每个文件中提取两个指定字符串之间的内容。
3. 将提取的内容与原文件名合并，形成新的文件名。
4. 将所有提取的内容合并到一个单独的文件中，并在每个内容前加上相应的文件名。

import os
import re

def extract_content_between_strings(content, first_string, second_string):
    """Return the text between two markers in ``content``.

    The first occurrence of ``first_string`` is located, then the first
    occurrence of ``second_string`` after it.

    Args:
        content: Text to search.
        first_string: Start marker (excluded from the result).
        second_string: End marker (excluded from the result).

    Returns:
        The text strictly between the two markers, or ``None`` when the start
        marker is missing or no end marker follows it.
    """
    # Locate the start marker.
    start_index = content.find(first_string)
    if start_index == -1:
        return None

    # The end marker is only searched for after the start marker.
    body_start = start_index + len(first_string)
    end_index = content.find(second_string, body_start)
    if end_index == -1:
        return None

    # Slice out the text between the two markers.
    return content[body_start:end_index]

def merge_files_with_names(folder_path, output_folder_path, first_string, second_string):
# 确保输出文件夹存在
自我介绍一下，小编13年上海交大毕业，曾经在小公司待过，也去过华为、OPPO等大厂，18年进入阿里一直到现在。

深知大多数Python工程师，想要提升技能，往往是自己摸索成长或者是报班学习，但对于培训机构动辄几千的学费，着实压力不小。不成体系的自学效率低、耗时长，而且极易碰到天花板，技术停滞不前！

因此收集整理了一份《2024年Python开发全套学习资料》，初衷也很简单，就是希望能够帮助到想自学提升又不知道该从何学起的朋友，同时减轻大家的负担。