Use insightface.app.FaceAnalysis to extract the segments of a video that contain a specified face and merge them into a new video. The "buffalo_l" model is used, and the model files must be installed under .\models in the script's directory. You need roop or any other environment that supports pytorch, insightface, and moviepy.
For installing pytorch, see my other articles.
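Before running the script, you can check that the model pack loads from the expected location with a minimal sketch like the one below (this assumes insightface's convention of looking for the pack under <root>/models/<name>; it is only a sanity check, not part of the script):

from insightface.app import FaceAnalysis
# expects the model files under ./models/buffalo_l
app = FaceAnalysis(name='buffalo_l', root='.', providers=['CUDAExecutionProvider'])
app.prepare(ctx_id=0, det_size=(640, 640))
print("buffalo_l loaded, detection size 640x640")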
# cython: language_level = 3str
# -*- coding: utf-8 -*-
import os
import threading
import time
import numpy as np
import cv2
import torch
import insightface
from insightface.app.common import Face
from moviepy.config import get_setting
from moviepy.tools import subprocess_call
import argparse
from queue import Queue,Full,Empty
from collections import OrderedDict
from scipy.spatial import distance
import json
THREAD_LOCK=threading.Lock()
THREAD_SEMAPHORE = threading.Semaphore()
BIG_GPUMEM: bool = False # set to True if GPU memory is larger than 2 GB
ROUND_POINT: int = 3 # number of decimal places to keep
FFPEG_CLIP_LOWEST_FPS: int = 26 # minimum number of frames for an output clip
gThatPath = os.path.dirname(__file__)
gOutput_Dir="Clip_Output"
gFaceApp: insightface.app.FaceAnalysis = None
gSrcFace: str = None
gInput_video: str = None
gOutput_video: str = None
gNo_audio: bool = False
gQueue_maxsize: int = 300
gFps_skip_interval: int = 1 # frame skip interval: 1 = process every frame, 2 = process every 2nd frame, and so on
gReserve_temp_files: bool = False
gMust_have_face: bool = False
# Initialize the InsightFace model
def Init():
global gFaceApp, gThatPath
get_FaceApp()
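# Returns a FaceAnalysis instance. With BIG_GPUMEM left at False, a single shared
# 'buffalo_l' instance is created lazily under a lock; with BIG_GPUMEM set, a new
# instance is created on every call (note that this branch loads the 'buffalo_m' pack).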
def get_FaceApp():
global gFaceApp,BIG_GPUMEM
if not BIG_GPUMEM:
if gFaceApp is None:
with THREAD_LOCK:
gFaceApp = insightface.app.FaceAnalysis(name='buffalo_l', root=gThatPath, providers=['CUDAExecutionProvider'])
gFaceApp.prepare(ctx_id=0, det_size=(640, 640))
return gFaceApp
else:
newFaceApp = insightface.app.FaceAnalysis(name='buffalo_m', root=gThatPath, providers=['CUDAExecutionProvider'])
newFaceApp.prepare(ctx_id=0, det_size=(640, 640))
return newFaceApp
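# Two face-comparison helpers: cosine distance (used by process_frame) and squared
# Euclidean distance between the normalized embeddings; smaller values mean "same face".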
def is_same_face_cosine(face1: Face, face2: Face, threshold=0.65):
    # cosine distance between the normalized embeddings; smaller means more similar
    dist = distance.cosine(face1.normed_embedding, face2.normed_embedding)
    return dist <= threshold
def is_same_face(face1: Face, face2: Face, threshold=0.6):
    # squared Euclidean distance between the normalized embeddings
    sq_dist = np.sum(np.square(face1.normed_embedding - face2.normed_embedding))
    return sq_dist < threshold
def process_frame(frame, reference_face):
faces = get_FaceApp().get(frame)
if faces:
for face in faces:
if is_same_face_cosine(face, reference_face):
return True
return False
def ffmpeg_merge_video(concat_list_path: str, output_video_path:str, audio:bool=True):
# '''
# ffmpeg -y -f concat -safe 0 -i "{concat_list_path}" -c copy -map 0:v -map 0:a
# -fflags +genpts "{temp_video_path}"'
# '''
if audio:
cmd = [get_setting("FFMPEG_BINARY"), "-y","-f","concat","-safe","0",
"-i", concat_list_path,
"-c","copy", "-map", "0:v", "-map", "0:a","-fflags", "+genpts",
output_video_path]
else:
cmd = [get_setting("FFMPEG_BINARY"), "-y","-f","concat","-safe","0",
"-i", concat_list_path,
"-c","copy", "-map", "0:v", "-fflags", "+genpts",
output_video_path]
subprocess_call(cmd)
def ffmpeg_extract_subclip(filename, t1, t2, targetname=None):
""" Makes a new video file playing video file ``filename`` between
the times ``t1`` and ``t2``. """
name, ext = os.path.splitext(filename)
if not targetname:
        T1, T2 = [int(1000*t) for t in [t1, t2]]
        targetname = "%sSUB%d_%d%s" % (name, T1, T2, ext)
cmd = [get_setting("FFMPEG_BINARY"),"-y",
"-ss", "%0.3f"%t1,
"-i", filename,
"-t", "%0.3f"%(t2-t1),
"-map", "0", "-vcodec", "copy", "-acodec", "copy", targetname]
subprocess_call(cmd)
def save_json(data, json_path):
with open(json_path, 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=4)
# Save the items of a queue to a JSON file, then put them back into the queue
def save_queue_json(data_queue:Queue,json_path:str)->Queue:
data_to_save=[]
while not data_queue.empty():
frame_count, result = data_queue.get()
data_to_save.append({"frame_count":frame_count,"result":result})
data_queue.task_done()
save_json(data_to_save, os.path.join(json_path,"reference_result.json"))
for item in data_to_save:
data_queue.put((item["frame_count"], item["result"]))
return data_queue
def extract_clip(input_video_path: str, start_time:float, end_time:float, temp_clip_path:str):
print(f"{input_video_path} Extracting clip from {start_time:.3f} to {end_time:.3f}")
try:
ffmpeg_extract_subclip(input_video_path, start_time, end_time, targetname=temp_clip_path)
except Exception as e:
print(f"Error extracting clip: {e}")
def ffmpeg_extractAndMergeClips(input_video_path:str, output_video_path:str,fps:float, face_timestamps:list,audio:bool=True):
global gFps_skip_interval
start_time:float = 0.0
end_time:float = 0.0
output_vidio_dir:str = os.path.dirname(output_video_path)
clips:list = []
for i, (start_time, end_time) in enumerate(face_timestamps):
temp_clip_path:str=os.path.join(output_vidio_dir , f"temp_clip_{i}.mp4")
extract_clip(input_video_path, start_time, end_time, temp_clip_path)
clips.append(temp_clip_path)
# Concatenate video clips using ffmpeg
concat_list_path:str = os.path.join(output_vidio_dir , "concat_list.txt")
with open(concat_list_path, 'w') as f:
for clip in clips:
f.write(f"file '{os.path.abspath(clip)}'\n")
temp_video_path:str = os.path.join(output_vidio_dir , "temp_video.mp4")
ffmpeg_merge_video(concat_list_path, temp_video_path, audio=audio)
try:
os.rename(temp_video_path, output_video_path)
except Exception as e:
print(f"Error renaming video: {e}")
if not gReserve_temp_files:
for clip in clips:
try:
os.remove(clip)
except Exception as e:
print(f"Error removing clip: {e}")
def genFaceTimestamps(result_queue:Queue, fps:float,fps_skip_interval:int)->list:
# global gClip_interval
face_timestamps:list = []
frame_count:int = 0
result:bool = False
secPerOneFrame:float=round(1/fps,ROUND_POINT)
clip_interval:float=round(FFPEG_CLIP_LOWEST_FPS/fps,ROUND_POINT)
while not result_queue.empty():
frame_count, result = result_queue.get()
if result:
current_time:float = round(frame_count / fps, ROUND_POINT)
            if not face_timestamps or current_time - face_timestamps[-1][1] > clip_interval*2: # gap larger than twice the clip interval: start a new segment
face_timestamps.append([current_time, current_time])
else:
face_timestamps[-1][1] = current_time
result_queue.task_done()
i:int=0
    while i < len(face_timestamps):
        if face_timestamps[i][1] == face_timestamps[i][0]:
            # a single detection: pad the segment to the minimum clip length
            face_timestamps[i][1] += secPerOneFrame*FFPEG_CLIP_LOWEST_FPS
            i += 1
            continue
        # if i < len(face_timestamps)-1:
        #     if face_timestamps[i][1] - face_timestamps[i+1][0] < clip_interval:  # gap smaller than the clip interval
        #         face_timestamps[i][1] = face_timestamps[i+1][1]
        #         face_timestamps.pop(i+1)
        i += 1
return face_timestamps
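# Worker thread: takes (frame, frame_count) pairs from the queue, runs face matching,
# and stores the boolean result keyed by frame number; a (None, None) item stops the worker.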
def worker(frame_queue, result_dict, reference_face):
while True:
frame, frame_count = frame_queue.get()
if frame is None:
break
result = process_frame(frame, reference_face)
# print(f"Frame {frame_count} processed.")
with THREAD_SEMAPHORE:
result_dict[frame_count] = result
frame_queue.task_done()
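# Read the video (optionally skipping frames), feed frames to the worker threads, and
# return the video fps together with an ordered queue of (frame_number, matched) results.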
def get_faceFramesToQueue(worker: callable, input_video_path: str, ref_face: Face, num_threads: int = 4) -> tuple:
global gFaceApp, gQueue_maxsize,gFps_skip_interval
try:
cap: cv2.VideoCapture = cv2.VideoCapture(input_video_path)
fps: float = round(cap.get(cv2.CAP_PROP_FPS),ROUND_POINT)
total_frames: int = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
print(f"Video: {input_video_path}\nFrames FPS: {fps}, Total frames: {total_frames}")
frame_count: int = 0
frame_skip: int = gFps_skip_interval if gFps_skip_interval >0 else 1
frame_queue: Queue = Queue(maxsize=gQueue_maxsize)
result_dict: OrderedDict =OrderedDict()
result_queue: Queue = Queue()
threads: list = []
for _ in range(num_threads):
t: threading.Thread = threading.Thread(target=worker, args=(frame_queue, result_dict, ref_face))
t.start()
threads.append(t)
while frame_count<total_frames:
ret, frame = cap.read()
if not ret:
break
frame_queue.put((frame, frame_count))
frame_count += frame_skip
frame_count = frame_count if frame_count < total_frames else total_frames
if frame_skip>1:
cap.set(cv2.CAP_PROP_POS_FRAMES, frame_count)
print(f"Processing frame {frame_count}/{total_frames}",end="\r")
for _ in range(num_threads):
frame_queue.put((None, None))
for t in threads:
t.join()
for frame_count in sorted(result_dict.keys()):
result_queue.put((frame_count, result_dict[frame_count]))
return fps,result_queue
finally:
cap.release()
gFaceApp=None
torch.cuda.empty_cache()
frame_queue=None
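# Top-level pipeline: load the reference face, scan the input video, then either rebuild a
# frame-accurate video with cv2 (--must_has_face) or cut and merge clips with ffmpeg.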
def clipVideoForFace(srcFace: str, input_video: str, output_video: str, audio:bool=True):
global gFaceApp,gQueue_maxsize
try:
reference_image: np.ndarray = cv2.imread(srcFace)
reference_faces: list = get_FaceApp().get(reference_image)
if len(reference_faces) == 0:
raise ValueError("Reference image does not contain any faces.")
reference_face: Face = reference_faces[0]
print(f"{srcFace} Reference face loaded.")
    except Exception as e:
        print(f"Error loading reference face: {e}")
        return
fps,result_queue = get_faceFramesToQueue(worker,input_video,reference_face,num_threads=4)
output_vidio_dir = os.path.dirname(output_video)
os.makedirs(output_vidio_dir, exist_ok=True)
print(f"Output video directory: {output_vidio_dir}")
save_queue_json(result_queue,output_vidio_dir)
if gMust_have_face and not result_queue.empty():
cv2_extractAndMergeClips(input_video, output_video, result_queue)
else:
face_timestamps = genFaceTimestamps(result_queue, fps, gFps_skip_interval)
if not face_timestamps:
print(f"File {input_video} does not contain any face timestamps found.")
return
with open(os.path.join(output_vidio_dir,"face_timestamps.txt"), 'w') as f:
for start_time, end_time in face_timestamps:
f.write(f"{start_time:.3f} {end_time:.3f}\n")
print(f'\n{len(face_timestamps)} face timestamps found.')
ffmpeg_extractAndMergeClips(input_video, output_video, fps,face_timestamps, audio=audio)#addtime=round(gFps_skip_interval/fps,3))
print(f"Output video saved to {output_video}.")
_ProcessQueueDone=(None,None)
class SaveQueueToVidso(threading.Thread):
def __init__(self,fps:float=None,frame_width:int=None,frame_height:int=None):
super().__init__()
self.queue=Queue()
self.fps=fps
self.frame_width=frame_width
self.frame_height=frame_height
def set_args(self,processFrames:list,output_video_path:str):
with THREAD_SEMAPHORE:
self.queue.put((processFrames,output_video_path))
    def run(self):
        while True:
            processFrames, out_path = self.queue.get()
            if processFrames is None or len(processFrames) == 0:
                self.queue.task_done()
                break
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            out = cv2.VideoWriter(out_path, fourcc, self.fps, (self.frame_width, self.frame_height))
            for frame in processFrames:
                if frame is None:
                    break
                out.write(frame)
            out.release()  # finalize this clip before handling the next batch
            self.queue.task_done()
            processFrames = None
        print("SaveQueueToVidso thread exit.")
# Use cv2 to write a queue of video frames out to a video file
def queueToVideo(queue:Queue,output_video_path:str,fps:float,frame_width:int,frame_height:int):
global gQueue_maxsize
try:
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))
while not queue.empty():
frame = queue.get()
if frame is None:
break
out.write(frame)
queue.task_done()
out.release()
except Exception as e:
print(f"Error writing video: {e}")
# Walk the per-frame results, grab the matching frames from the video, write every
# 1024 frames to a temporary video in the output directory, and finally merge them into one video.
def cv2_extractAndMergeClips(input_video_path:str, output_video_path:str,result_dict:Queue):
global gFps_skip_interval
tempFileNames:list[str]=[]
output_vidio_dir:str = os.path.dirname(output_video_path)
frame_count:int = 0
lstFrame:Queue=Queue(1024)
    # initialize the video reader
try:
cap: cv2.VideoCapture = cv2.VideoCapture(input_video_path)
fps: float = round(cap.get(cv2.CAP_PROP_FPS),ROUND_POINT)
frame_width: int = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height: int = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
total_frames: int = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
sqtv:SaveQueueToVidso=SaveQueueToVidso(fps=fps,frame_width=frame_width,frame_height=frame_height)
sqtv.start()
        # iterate over the results and grab the corresponding frames from the video
        # for index, (frameNum, result) in enumerate(result_dict.items()):
while not result_dict.empty():
frameNum,result = result_dict.get()
if result:
for i in range(int(gFps_skip_interval)):
cap.set(cv2.CAP_PROP_POS_FRAMES, frameNum+i)
ret, frame = cap.read()
if not ret:
break
while True:
try:
lstFrame.put(frame,block=False)
frame_count+=1
print(f'Processing frame {frameNum+i}/{total_frames}',end='\r')
break
except Full:
temp_video_path:str = os.path.join(output_vidio_dir , f"temp_video_{frame_count}.mp4")
procesFrames=list(lstFrame.queue)
lstFrame.queue.clear()
sqtv.set_args(procesFrames,temp_video_path)
tempFileNames.append(temp_video_path)
if not lstFrame.empty():
            # merge the frames remaining in lstFrame into one video
temp_video_path:str = os.path.join(output_vidio_dir , f"temp_video_{frame_count}.mp4")
procesFrames=list(lstFrame.queue)
lstFrame.queue.clear()
sqtv.set_args(procesFrames,temp_video_path)
tempFileNames.append(temp_video_path)
        # merge the temporary videos in tempFileNames into a single output video
sqtv.queue.put(_ProcessQueueDone)
sqtv.join()
if len(tempFileNames)>1:
concat_list_path:str = os.path.join(output_vidio_dir , "concat_list.txt")
with open(concat_list_path, 'w') as f:
for temp_video_path in tempFileNames:
f.write(f"file '{os.path.abspath(temp_video_path)}'\n")
temp_video_path:str = os.path.join(output_vidio_dir , "temp_video_merged.mp4")
ffmpeg_merge_video(concat_list_path, temp_video_path, audio=False)
os.rename(temp_video_path, output_video_path)
if not gReserve_temp_files:
for temp_video_path in tempFileNames:
os.remove(temp_video_path)
else:
os.rename(tempFileNames[0], output_video_path)
print(f"Output video saved to {output_video_path}.")
except Exception as e:
print(f"Error processing video: {e}")
finally:
cap.release()
def argParse():
global gSrcFace, gInput_video, gOutput_video, gNo_audio,gFps_skip_interval,\
gReserve_temp_files,BIG_GPUMEM,gMust_have_face#,gClip_interval
parser = argparse.ArgumentParser()
parser.add_argument("srcFace", help="Path to reference face image.")
parser.add_argument("input_video", help="Path to input video.")
parser.add_argument("--must_has_face",default=False,action="store_true",help="Must have face in frame.")
parser.add_argument("-o","--output_video", default=None, help="Path to output video.")
parser.add_argument("--no_audio", default=False, action="store_true", help="Disable audio in output video.")
parser.add_argument("--big_gpumem", default=False, action="store_true", help="Default is False. If you GPU memory >2G,Set it.")
# parser.add_argument("--clip_interval", default=0.2, type=float, help="Interval between clips in seconds.")
parser.add_argument("--reserve_temp_files", default=False, action="store_true", help="Keep temporary files after merging.")
parser.add_argument("--fps_skip_interval", default=2, type=int, help="Interval between frames to skip in seconds.")
args = parser.parse_args()
gSrcFace = args.srcFace
gInput_video = args.input_video
gOutput_video = args.output_video if args.output_video else \
os.path.join(os.path.dirname(gInput_video) ,
f"{gOutput_Dir}\\clip_output_{os.path.splitext(os.path.basename(gInput_video))[0]}.mp4")
gNo_audio = args.no_audio
BIG_GPUMEM = args.big_gpumem
# gClip_interval = args.clip_interval
gReserve_temp_files = args.reserve_temp_files
gFps_skip_interval = args.fps_skip_interval
gMust_have_face = args.must_has_face
def main():
global gSrcFace, gInput_video, gOutput_video, gNo_audio
try:
argParse()
    except Exception as e:
        print(f"Error parsing arguments: {e}")
exit(1)
startTime=time.time()
Init()
clipVideoForFace(gSrcFace, gInput_video, gOutput_video, not gNo_audio)
endTime=time.time()
print(f"Total time taken: {endTime-startTime:.2f} seconds.")
VERSION="0.5.0"
if __name__ == '__main__':
main()
Basic usage:
1. srcFace: the reference face image used to identify the person in the video; a screenshot from the video works well.
2. input_video: the video from which clips containing the specified face will be extracted (see the example invocation after this list).
3. --fps_skip_interval: the step used when sampling video frames; minimum 1, default 2, 4 is recommended (i.e. recognition runs once every 4 frames). The larger the value, the faster the processing but the lower the recognition accuracy. With this set to 4, a 10-minute video takes roughly 7 minutes to process.
4. --must_has_face: ensures every output frame contains the specified face, but because the output is rebuilt directly from the extracted frames, it has no audio.
5. FFPEG_CLIP_LOWEST_FPS: int = 26 is the minimum number of frames per clip extracted with ffmpeg, so even with must_has_face specified the result may still contain around 25 frames without the specified face.
6. The code is for reference only.
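For reference, an example invocation (assuming the script is saved as clip_face_video.py; all paths here are illustrative) might look like:

python clip_face_video.py face.jpg D:\videos\input.mp4 --fps_skip_interval 4 --no_audio

With no -o/--output_video given, the result is written to a Clip_Output folder next to the input video, e.g. D:\videos\Clip_Output\clip_output_input.mp4.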