python多进程数据处理

苏沐棠an

已于 2023-07-27 23:38:23 修改

阅读量357

点赞数

文章标签： python

于 2023-07-27 23:34:08 首次发布

本文链接：https://blog.csdn.net/qq_55927986/article/details/131970604

版权

python多进程数据处理

对于python多进程的基础写法，推荐参考这篇学习：

Python 多核并行计算 - 陈乐群的文章 - 知乎 https://zhuanlan.zhihu.com/p/24311810

与一众被科研耽误的影帝们一起拍摄了10G的异常事件数据集后，需要进行一次视频数据预处理。但是简单的python代码实现在数据量庞大的情况下时间开销比较大，所以尝试使用并行计算进行加速。

python的GIL机制使得常规情况下python的多线程机制在计算密集型任务上不能起到实质性的帮助，但是没关系~ 多进程方式（数据并行）允许数据被分发到多个CPU上进行处理而互不干涉，很适合现代处理器的多CPU结构。

在使用多进程后，在某台上古服务器（8核i3）上的处理速度由4h变成了1h，体验到快乐的小蔡不禁将目光投向了实验室刚到的128核刀片服务器。

接下来进入python数据处理的实例：

现在有一些.mp4格式的视频被保存在了多级文件夹下，文件树深度不可知，并且其中混杂了其他格式的文件。现在需要将其中的每个.mp4视频文件逐个处理：1. 切成每段64帧的片段，保存到与视频文件同名的文件夹下；2. 将所有视频的分辨率resize成某个特定尺寸。

请添加图片描述
在这里插入图片描述

库声明

import multiprocessing  #多进程库
import os
import cv2
from tqdm import tqdm	#进度条

进程池初始化

def multiprocess_init(use_cores=32):
    """
    Create the worker pool used for data-parallel processing.

    The pool size is ``use_cores`` capped at the number of CPUs reported by
    ``multiprocessing.cpu_count()``, so no more processes than cores are
    started. For I/O-bound jobs the cap can be removed to run more
    processes than cores.

    :param use_cores: requested number of worker processes
    :return: tuple ``(pool, cores)`` - the ``multiprocessing.Pool`` and the
        number of processes it was created with
    """
    available = multiprocessing.cpu_count()
    # Never start more workers than there are cores.
    cores = available if available < use_cores else use_cores
    print("cores：", cores)
    return multiprocessing.Pool(processes=cores), cores

获取需要处理的文件列表

取得需要的数据并做成列表，后续通过多进程并行处理这些数据

def get_video_files(path):
    """
    Recursively collect the video files stored under ``path``.

    A file counts as a video when its name ends in ``.mp4`` or ``.avi``;
    the comparison ignores case, so ``clip.MP4`` is found as well. Every
    sub-directory is searched, whatever its depth; other file types are
    ignored.

    :param path: directory to search
    :return: list of file paths, each built by joining ``path`` with the
        relative location of the file (absolute only if ``path`` is
        absolute); order follows ``os.listdir``
    """
    video_extensions = ('.mp4', '.avi')
    video_files = []
    for filename in os.listdir(path):
        filepath = os.path.join(path, filename)
        if os.path.isdir(filepath):
            # Descend into the sub-directory and keep whatever it holds.
            video_files += get_video_files(filepath)
        elif os.path.isfile(filepath) and filename.lower().endswith(video_extensions):
            video_files.append(filepath)
    return video_files

实现对单个文件的处理函数

这个函数的内容随任务变化。函数比较长，完整的函数就直接放在最后的完整代码里面了

def video_snipping(video_file, output_dir, frame=64, resize=None):
    """
    Split the video ``video_file`` into clips of ``frame`` frames each and
    store them in ``output_dir``.

    Signature-only excerpt: the body is omitted here.

    :param video_file: absolute path of the video to split
    :param output_dir: output directory for the generated clips
    :param frame: clip length, in frames
    :param resize: optional ``(width, height)`` target size for the frames;
        ``None`` keeps the source resolution
    :return: None
    """
def video_snipping_multiprocess(input, frame=64, resize=(565, 320)):
    """
    Multi-process variant of the video splitting routine.

    Resizes the video ``video_file`` to ``resize``, splits it into clips of
    ``frame`` frames each, and stores them under ``output_dir`` in a folder
    named after the video. The only difference from the single-process
    version is the calling convention: the per-video arguments are packed
    into the single ``input`` tuple so the function can be mapped over a
    work list by a process pool.

    :param input: pair ``(video_file, output_dir)`` where ``video_file`` is
        the absolute path of the video to split and ``output_dir`` is the
        output directory for the generated clips
    :param frame: clip length, in frames
    :param resize: ``(width, height)`` the frames are resized to
    :return: None
    """
    # Indented with spaces like the rest of the body; a tab here mixes tabs
    # and spaces, which Python 3 rejects with a TabError.
    video_file, output_dir = input
    ...

主函数

多进程下报错不好调试，所以预留了单进程的实现。

if __name__ == '__main__':
    # Root of the raw videos and root of the mirrored output tree.
    video_folder = r"E:\tmp\NBUabnormal"
    output_folder = r"E:\tmp\NBUabnormal_out"

    video_files = get_video_files(video_folder)
    total = len(video_files)

    # One output directory per video: swap the input root for the output
    # root and strip the extension, so the clips of ".../x/a.mp4" end up in
    # ".../x/a/". (To write plain files instead, keep the extension, i.e.
    # drop the splitext call.)
    video_files_output = [
        os.path.splitext(src.replace(video_folder, output_folder))[0]
        for src in video_files
    ]

    # Work list of (video_file, output_dir) pairs.
    ds = list(zip(video_files, video_files_output))

    # Single-process run (kept for debugging).
    for src_path, dst_dir in tqdm(ds):
        video_snipping(src_path, dst_dir, resize=(565, 320))

    # Multi-process run
    # pool, cores = multiprocess_init()
    # with tqdm(total=total) as pbar:
    #     for _ in pool.imap_unordered(video_snipping_multiprocess, ds):  # or pool.map(f, ds)
    #         pbar.update(1)

完整代码

"""
将视频分割成小段，同时resize
实现了多进程版本
"""
import multiprocessing
import os
import cv2
from tqdm import tqdm


def multiprocess_init(use_cores=1024):
    """
    Create the worker pool used for data-parallel processing.

    The pool size is ``use_cores`` capped at the number of CPUs reported by
    ``multiprocessing.cpu_count()``; with the large default this means one
    worker per core.

    :param use_cores: requested number of worker processes
    :return: tuple ``(pool, cores)`` - the ``multiprocessing.Pool`` and the
        number of processes it was created with
    """
    available = multiprocessing.cpu_count()
    # Never start more workers than there are cores.
    cores = available if available < use_cores else use_cores
    print("cores：", cores)
    return multiprocessing.Pool(processes=cores), cores


def get_video_files(path):
    """
    Recursively collect the video files stored under ``path``.

    A file counts as a video when its name ends in ``.mp4`` or ``.avi``;
    the comparison ignores case, so ``clip.MP4`` is found as well. Every
    sub-directory is searched, whatever its depth; other file types are
    ignored.

    :param path: directory to search
    :return: list of file paths, each built by joining ``path`` with the
        relative location of the file (absolute only if ``path`` is
        absolute); order follows ``os.listdir``
    """
    video_extensions = ('.mp4', '.avi')
    video_files = []
    for filename in os.listdir(path):
        filepath = os.path.join(path, filename)
        if os.path.isdir(filepath):
            # Descend into the sub-directory and keep whatever it holds.
            video_files += get_video_files(filepath)
        elif os.path.isfile(filepath) and filename.lower().endswith(video_extensions):
            video_files.append(filepath)
    return video_files


def video_snipping(video_file, output_dir, frame=64, resize=None):
    """
    Split the video ``video_file`` into consecutive clips of ``frame``
    frames each and store them in ``output_dir``.

    Clips are named ``<video name>_<k>.mp4`` with ``k`` starting at 1 and
    are written with the ``mp4v`` FourCC at the frame rate reported for the
    source. A final clip shorter than ``frame`` frames is kept.

    :param video_file: absolute path of the video to split
    :param output_dir: output directory for the clips; created if missing
    :param frame: clip length, in frames (must be at least 1)
    :param resize: optional ``(width, height)``; when given, every frame is
        resized to it with ``cv2.INTER_AREA``. ``None`` keeps the source
        resolution.
    :return: None
    :raises ValueError: if ``frame`` is smaller than 1
    """
    if frame < 1:
        # Fail early with a clear message instead of a ZeroDivisionError
        # (frame == 0) or a silent no-op (frame < 0) further down.
        raise ValueError(f"frame must be >= 1, got {frame!r}")

    # exist_ok replaces the separate exists() check, which could race with
    # another process creating the same directory.
    os.makedirs(output_dir, exist_ok=True)

    clip_prefix = os.path.splitext(os.path.basename(video_file))[0]
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')

    cap = cv2.VideoCapture(video_file)
    try:
        # Frame rate and total frame count as reported for the source.
        fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        # Ceiling division so the trailing, shorter-than-`frame` clip is
        # written as well.
        num_segments = (total_frames + frame - 1) // frame

        for index in range(num_segments):
            # Read up to `frame` frames for this clip.
            frames = []
            for _ in range(frame):
                ok, image = cap.read()
                if not ok:
                    break
                if resize:
                    image = cv2.resize(image, resize, interpolation=cv2.INTER_AREA)
                frames.append(image)

            if not frames:
                # cap.read() can fail before the reported frame count is
                # reached; stop instead of indexing an empty list.
                break

            if resize:
                width, height = resize
            else:
                height, width = frames[0].shape[:2]

            segment_path = os.path.join(output_dir, f"{clip_prefix}_{index + 1}.mp4")
            writer = cv2.VideoWriter(segment_path, fourcc, fps, (width, height))
            try:
                for image in frames:
                    writer.write(image)
            finally:
                # Release the writer even if writing a frame fails.
                writer.release()
    finally:
        # Release the capture on every exit path.
        cap.release()


def video_snipping_multiprocess(input, frame=64, resize=(565, 320)):
    """
    Multi-process entry point for ``video_snipping``.

    Resizes the video ``video_file`` to ``resize``, splits it into clips of
    ``frame`` frames each, and stores them in ``output_dir``. The only
    difference from ``video_snipping`` is the calling convention: the
    per-video arguments are packed into the single ``input`` tuple so the
    function can be mapped over a work list with ``Pool.map`` /
    ``Pool.imap_unordered``.

    :param input: pair ``(video_file, output_dir)`` where ``video_file`` is
        the absolute path of the video to split and ``output_dir`` is the
        output directory for the generated clips
    :param frame: clip length, in frames
    :param resize: ``(width, height)`` the frames are resized to
    :return: None
    """
    video_file, output_dir = input
    # Delegate instead of duplicating the whole splitting loop, so both
    # entry points always behave the same.
    video_snipping(video_file, output_dir, frame=frame, resize=resize)


if __name__ == '__main__':
    # Root of the raw videos and root of the mirrored output tree.
    video_folder = r"E:\tmp\NBUabnormal"
    output_folder = r"E:\tmp\NBUabnormal_out"

    video_files = get_video_files(video_folder)
    total = len(video_files)

    # One output directory per video: swap the input root for the output
    # root and strip the extension, so the clips of ".../x/a.mp4" end up in
    # ".../x/a/". (To write plain files instead, keep the extension, i.e.
    # drop the splitext call.)
    video_files_output = [
        os.path.splitext(src.replace(video_folder, output_folder))[0]
        for src in video_files
    ]

    # Work list of (video_file, output_dir) pairs.
    ds = list(zip(video_files, video_files_output))

    # Single-process run (kept for debugging).
    for src_path, dst_dir in tqdm(ds):
        video_snipping(src_path, dst_dir, resize=(565, 320))

    # Multi-process run
    # pool, cores = multiprocess_init()
    # with tqdm(total=total) as pbar:
    #     for _ in pool.imap_unordered(video_snipping_multiprocess, ds):  # or pool.map(f, ds)
    #         pbar.update(1)