# 第一步:把数据集中每个实验对象的 summary 文件合并到同一个 txt 文件里:
import os
# Root folder whose sub-folders hold the per-subject summary files
main_folder = '.../CHB-MIT'  # replace with the real root folder path
# File that receives the merged content
output_file = '.../seizure_summary.txt'  # replace with the output file you want
# Lines containing any of these keywords are dropped from the merged file
keywords_to_remove = ['Channel']  # replace with your own keyword list


def merge_summary_files(source_root, merged_path, drop_keywords):
    """Merge every ``.txt`` file under *source_root* into *merged_path*.

    Lines that contain any string from *drop_keywords* are left out.
    Directories and files are visited in sorted order so repeated runs
    produce the same merged file.

    Args:
        source_root: Folder that is walked recursively.
        merged_path: File to create (or overwrite) with the merged text.
        drop_keywords: Substrings; a line containing any of them is dropped.

    Returns:
        The number of ``.txt`` files that were merged.
    """
    merged_abs = os.path.normcase(os.path.abspath(merged_path))
    merged_count = 0
    with open(merged_path, 'w', encoding='utf-8') as output:
        for root, dirs, files in os.walk(source_root):
            # Sorting in place also fixes the order in which os.walk descends.
            dirs.sort()
            for file in sorted(files):
                if not file.endswith('.txt'):
                    continue
                file_path = os.path.join(root, file)
                # The merged file may itself live under source_root; reading
                # it back while it is being written would corrupt the result.
                if os.path.normcase(os.path.abspath(file_path)) == merged_abs:
                    continue
                with open(file_path, 'r', encoding='utf-8') as txt_file:
                    kept_lines = [
                        line for line in txt_file
                        if not any(keyword in line for keyword in drop_keywords)
                    ]
                # A file without a final newline would otherwise glue its last
                # line onto the first line of the next file.
                if kept_lines and not kept_lines[-1].endswith('\n'):
                    kept_lines[-1] += '\n'
                output.writelines(kept_lines)
                merged_count += 1
    return merged_count


if __name__ == "__main__":
    merge_summary_files(main_folder, output_file, keywords_to_remove)
    print("文本文件合并并删除关键词所在行完成。")
# 第二步:根据上一步合并得到的 seizure_summary 文件(文件名是自己取的),把发作时段的数据分割出来。
import os
from scipy.io import loadmat, savemat
# Paths used by the segmentation script
text_file_path = '.../seizure_summary.txt'  # merged summary text file
data_folder = '.../CHB-MAT-Filter'  # folder that contains the MAT files
output_folder = '.../CHB-MAT-Filter/seizure_segment'  # folder for the extracted seizure segments
# Create the output folder if it is missing. exist_ok=True replaces the
# separate exists() check, which could race with another process creating
# the folder between the check and the makedirs() call.
os.makedirs(output_folder, exist_ok=True)
# NOTE(review): the summary times are parsed as seconds but were used directly
# as column indices. That is only correct if the MAT arrays hold one sample
# per second. CHB-MIT is commonly described as recorded at 256 Hz - confirm
# the real rate of your MAT files and set it here. 1 keeps the old behaviour.
SAMPLES_PER_SECOND = 1


def _classify_key(key):
    """Return 'file', 'start', 'end' or None for the text before the colon.

    Underscore and space separated keys are treated alike, and an optional
    seizure index is allowed ('Seizure_Start_Time', 'Seizure 2 Start Time').
    Presumably the spaced/numbered form is what unmodified CHB-MIT summary
    files contain - verify against your data.
    """
    tokens = key.replace('_', ' ').split()
    if tokens[:2] == ['File', 'Name']:
        return 'file'
    if tokens[:1] == ['Seizure']:
        rest = tokens[1:]
        if rest and rest[0].isdigit():
            rest = rest[1:]
        if rest[:2] == ['Start', 'Time']:
            return 'start'
        if rest[:2] == ['End', 'Time']:
            return 'end'
    return None


def parse_seizure_summary(lines):
    """Parse summary lines into one record per file.

    Args:
        lines: Iterable of text lines (an open text file works).

    Returns:
        A list of dicts ``{'file_name': str, 'time_ranges': [...]}`` where
        each time range is ``{'start_time': int, 'end_time': int or None}``.
        Times are the integers found in front of the word 'seconds'.
    """
    records = []
    current = None
    for line in lines:
        key, colon, value = line.partition(':')
        kind = _classify_key(key) if colon else None
        if kind == 'file':
            # Register the record immediately so the last file in the summary
            # is not lost when the input ends.
            current = {'file_name': value.strip(), 'time_ranges': []}
            records.append(current)
        elif kind in ('start', 'end'):
            if current is None:
                print(f"Seizure time found before any file name: {line}")
                continue
            try:
                seconds = int(value.split('seconds')[0])
            except ValueError:
                # The value in front of 'seconds' is not an integer.
                print(f"Error parsing {kind} time in line: {line}")
                continue
            ranges = current['time_ranges']
            if kind == 'start':
                ranges.append({'start_time': seconds, 'end_time': None})
            elif ranges and ranges[-1]['end_time'] is None:
                # Only close a range that is still open. After an unparsable
                # start line the end time must not overwrite the end of the
                # previous seizure.
                ranges[-1]['end_time'] = seconds
    return records


def _find_mat_files(data_folder, skip_folder):
    """Return sorted (file name, path) pairs of .mat files under data_folder.

    The sub-folder *skip_folder* is not entered, so files this script saved
    earlier are never picked up as inputs.
    """
    skip_abs = os.path.normcase(os.path.abspath(skip_folder))
    found = []
    for root, dirs, files in os.walk(data_folder):
        # Assigning to dirs[:] prunes the walk in place.
        dirs[:] = sorted(
            name for name in dirs
            if os.path.normcase(os.path.abspath(os.path.join(root, name))) != skip_abs
        )
        for file in sorted(files):
            if file.endswith('.mat'):
                found.append((file, os.path.join(root, file)))
    return found


def slice_seizure_segments(records, data_folder, output_folder, samples_per_second=1):
    """Cut the seizure intervals out of the matching MAT files and save them.

    A MAT file matches a record when the record's file name is a substring of
    the MAT file name. Every non-metadata variable is sliced along its second
    axis from ``start_time * samples_per_second`` to
    ``end_time * samples_per_second`` and saved as
    ``sliced_<file_name>_<start_time>_<end_time>.mat`` in *output_folder*.

    Args:
        records: Output of ``parse_seizure_summary``.
        data_folder: Folder searched recursively for ``.mat`` files.
        output_folder: Folder for the sliced files (created if missing).
        samples_per_second: Factor that turns seconds into column indices.

    Returns:
        The list of paths that were written.
    """
    os.makedirs(output_folder, exist_ok=True)
    # Walk the tree once instead of once per seizure interval.
    mat_files = _find_mat_files(data_folder, output_folder)
    saved_paths = []
    for record in records:
        intervals = []
        for time_range in record['time_ranges']:
            start_time = time_range['start_time']
            end_time = time_range['end_time']
            # Skip intervals with a missing start or end before touching disk.
            if start_time is None or end_time is None:
                print(f"Start_time or end_time missing for record: {record['file_name']}")
                continue
            intervals.append((start_time, end_time))
        if not intervals:
            continue
        for file, file_path in mat_files:
            if record['file_name'] not in file:
                continue
            try:
                mat_data = loadmat(file_path)
            except FileNotFoundError:
                print(f"MAT file not found: {file_path}")
                continue
            for start_time, end_time in intervals:
                start_index = start_time * samples_per_second
                end_index = end_time * samples_per_second
                # Keys wrapped in double underscores are skipped, as in the
                # original script (treated there as MAT metadata).
                sliced_data = {
                    key: value[:, start_index:end_index]
                    for key, value in mat_data.items()
                    if not (key.startswith('__') and key.endswith('__'))
                }
                output_file_name = f"sliced_{record['file_name']}_{start_time}_{end_time}.mat"
                output_file_path = os.path.join(output_folder, output_file_name)
                savemat(output_file_path, sliced_data)
                saved_paths.append(output_file_path)
    return saved_paths


if __name__ == "__main__":
    # UTF-8 matches the encoding the merge step uses when writing the summary.
    with open(text_file_path, 'r', encoding='utf-8') as file:
        data = parse_seizure_summary(file)
    slice_seizure_segments(data, data_folder, output_folder, SAMPLES_PER_SECOND)
    print('分割完成')