Use insightface.app.FaceAnalysis to extract the segments of a video that contain a specified face and merge them into a new video. The "buffalo_l" model is used, and the model files must be installed under .\models in the script's directory. You need roop or any other environment that supports pytorch, insightface, and moviepy.
For installing pytorch, see my other articles.
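Before running the script, you can check that the model pack loads from the expected location with a minimal sketch like the one below (this assumes insightface's convention of looking for the pack under <root>/models/<name>; it is only a sanity check, not part of the script):

from insightface.app import FaceAnalysis
# expects the model files under ./models/buffalo_l
app = FaceAnalysis(name='buffalo_l', root='.', providers=['CUDAExecutionProvider'])
app.prepare(ctx_id=0, det_size=(640, 640))
print("buffalo_l loaded, detection size 640x640")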
# cython: language_level = 3str
# -*- coding: utf-8 -*-
import os
import threading
import time
import numpy as np
import cv2
import torch
import insightface
from insightface.app.common import Face
from moviepy.config import get_setting
from moviepy.tools import subprocess_call
import argparse
from queue import Queue,Full,Empty
from collections import OrderedDict
from scipy.spatial import distance
import json
THREAD_LOCK=threading.Lock()
THREAD_SEMAPHORE = threading.Semaphore()
BIG_GPUMEM: bool = False # set to True if GPU memory is larger than 2 GB
ROUND_POINT: int = 3 # number of decimal places to keep
FFPEG_CLIP_LOWEST_FPS: int = 26 # minimum number of frames for an output clip
gThatPath = os.path.dirname(__file__)
gOutput_Dir="Clip_Output"
gFaceApp: insightface.app.FaceAnalysis = None
gSrcFace: str = None
gInput_video: str = None
gOutput_video: str = None
gNo_audio: bool = False
gQueue_maxsize: int = 300
gFps_skip_interval: int = 1 # frame skip interval: 1 = process every frame, 2 = process every 2nd frame, and so on
gReserve_temp_files: bool = False
gMust_have_face: bool = False
# Initialize the InsightFace model
def Init():
global gFaceApp, gThatPath
get_FaceApp()
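# Returns a FaceAnalysis instance. With BIG_GPUMEM left at False, a single shared
# 'buffalo_l' instance is created lazily under a lock; with BIG_GPUMEM set, a new
# instance is created on every call (note that this branch loads the 'buffalo_m' pack).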
def get_FaceApp():
global gFaceApp,BIG_GPUMEM
if not BIG_GPUMEM:
if gFaceApp is None:
with THREAD_LOCK:
gFaceApp = insightface.app.FaceAnalysis(name='buffalo_l', root=gThatPath, providers=['CUDAExecutionProvider'])
gFaceApp.prepare(ctx_id=0, det_size=(640, 640))
return gFaceApp
else:
newFaceApp = insightface.app.FaceAnalysis(name='buffalo_m', root=gThatPath, providers=['CUDAExecutionProvider'])
newFaceApp.prepare(ctx_id=0, det_size=(640, 640))
return newFaceApp
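# Two face-comparison helpers: cosine distance (used by process_frame) and squared
# Euclidean distance between the normalized embeddings; smaller values mean "same face".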
def is_same_face_cosine(face1: Face, face2: Face, threshold=0.65):
    # cosine distance between the normalized embeddings; smaller means more similar
    dist = distance.cosine(face1.normed_embedding, face2.normed_embedding)
    return dist <= threshold
def is_same_face(face1: Face, face2: Face, threshold=0.6):
    # squared Euclidean distance between the normalized embeddings
    sq_dist = np.sum(np.square(face1.normed_embedding - face2.normed_embedding))
    return sq_dist < threshold
def process_frame(frame, reference_face):
faces = get_FaceApp().get(frame)
if faces:
for face in faces:
if is_same_face_cosine(face, reference_face):
return True
return False
def ffmpeg_merge_video(concat_list_path: str, output_video_path:str, audio:bool=True):
# '''
# ffmpeg -y -f concat -safe 0 -i "{concat_list_path}" -c copy -map 0:v -map 0:a
# -fflags +genpts "{temp_video_path}"'
# '''
if audio:
cmd = [get_setting("FFMPEG_BINARY"), "-y","-f","concat","-safe","0",
"-i", concat_list_path,
"-c","copy", "-map", "0:v", "-map", "0:a","-fflags", "+genpts",
output_video_path]
else:
cmd = [get_setting("FFMPEG_BINARY"), "-y","-f","concat","-safe","0",
"-i", concat_list_path,
"-c","copy", "-map", "0:v", "-fflags", "+genpts",
output_video_path]
subprocess_call(cmd)
def ffmpeg_extract_subclip(filename, t1, t2, targetname=None):
""" Makes a new video file playing video file ``filename`` between
the times ``t1`` and ``t2``. """
name, ext = os.path.splitext(filename)
if not targetname:
        T1, T2 = [int(1000*t) for t in [t1, t2]]
        targetname = "%sSUB%d_%d%s" % (name, T1, T2, ext)
cmd = [get_setting("FFMPEG_BINARY"),"-y",
"-ss", "%0.3f"%t1,
"-i", filename,
"-t", "%0.3f"%(t2-t1),
"-map", "0", "-vcodec", "copy", "-acodec", "copy", targetname]
subprocess_call(cmd)
def save_json(data, json_path):
with open(json_path, 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=4)
# Save the items of a queue to a JSON file, then put them back into the queue
def save_queue_json(data_queue:Queue,json_path:str)->Queue:
data_to_save=[]
while not data_queue.empty():
frame_count, result = data_queue.get()
data_to_save.append({"frame_count":frame_count,"result":result})
data_queue.task_done()
save_json(data_to_save, os.path.join(json_path,"reference_result.json"))
for item in data_to_save:
data_queue.put((item["frame_count"], item["result"]))
return data_queue
def extract_clip(input_video_path: str, start_time:float, end_time:float, temp_clip_path:str):
print(f"{input_video_path} Extracting clip from {start_time:.3f} to {end_time:.3f}")
try:
ffmpeg_extract_subclip(input_video_path, start_time, end_time, targetname=temp_clip_path)
except Exception as e:
print(f"Error extracting clip: {e}")
def ffmpeg_extractAndMergeClips(input_video_path:str, output_video_path:str,fps:float, face_timestamps:list,audio:bool=True):
global gFps_skip_interval
start_time:float = 0.0
end_time:float = 0.0
output_vidio_dir:str = os.path.dirname(output_video_path)
clips:list = []
for i, (start_time, end_time) in enumerate(face_timestamps):
temp_clip_path:str=os.path.join(output_vidio_dir , f"temp_clip_{i}.mp4")
extract_clip(input_video_path, start_time, end_time, temp_clip_path)
clips.append(temp_clip_path)
# Concatenate video clips using ffmpeg
concat_list_path:str = os.path.join(output_vidio_dir , "concat_list.txt")
with open(concat_list_path, 'w') as f:
for clip in clips:
f.write(f"file '{os.path.abspath(clip)}'\n")
temp_video_path:str = os.path.join(output_vidio_dir , "temp_video.mp4")
ffmpeg_merge_video(concat_list_path, temp_video_path, audio=audio)
try:
os.rename(temp_video_path, output_video_path)
except Exception as e:
print(f"Error renaming video: {e}")
if not gReserve_temp_files:
for clip in clips:
try:
os.remove(clip)
except Exception as e:
print(f"Error removing clip: {e}")
def genFaceTimestamps(result_queue:Queue, fps:float,fps_skip_interval:int)->list:
# global gClip_interval
face_timestamps:list = []
frame_count:int = 0
result:bool = False
secPerOneFrame:float=round(1/fps,ROUND_POINT)
clip_interval:float=round(FFPEG_CLIP_LOWEST_FPS/fps,ROUND_POINT)
while not result_queue.empty():
frame_count, result = result_queue.get()
if result:
current_time:float = round(frame_count / fps, ROUND_POINT)
            if not face_timestamps or current_time - face_timestamps[-1][1] > clip_interval*2: # gap larger than twice the clip interval: start a new segment
face_timestamps.append([current_time, current_time])
else:
face_timestamps[-1][1] = current_time
result_queue.task_done()
i:int=0
    while i < len(face_timestamps):
        if face_timestamps[i][1] == face_timestamps[i][0]:
            # a single detection: pad the segment to the minimum clip length
            face_timestamps[i][1] += secPerOneFrame*FFPEG_CLIP_LOWEST_FPS
            i += 1
            continue
        # if i < len(face_timestamps)-1:
        #     if face_timestamps[i][1] - face_timestamps[i+1][0] < clip_interval:  # gap smaller than the clip interval
        #         face_timestamps[i][1] = face_timestamps[i+1][1]
        #         face_timestamps.pop(i+1)
        i += 1
return face_timestamps
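# Worker thread: takes (frame, frame_count) pairs from the queue, runs face matching,
# and stores the boolean result keyed by frame number; a (None, None) item stops the worker.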
def worker(frame_queue, result_dict, reference_face):
while True:
frame, frame_count = frame_queue.get()
if frame is None:
break
result = process_frame(frame, reference_face)
# print(f"Frame {frame_count} processed.")
with THREAD_SEMAPHORE:
result_dict[frame_count] = result
frame_queue.task_done()
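# Read the video (optionally skipping frames), feed frames to the worker threads, and
# return the video fps together with an ordered queue of (frame_number, matched) results.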
def get_faceFramesToQueue(worker: callable, input_video_path: str, ref_face: Face, num_threads: int = 4) -> tuple:
global gFaceApp, gQueue_maxsize,gFps_skip_interval
try:
cap: cv2.VideoCapture = cv2.VideoCapture(input_video_path)
fps: float = round(cap.get(cv2.CAP_PROP_FPS),ROUND_POINT)
total_frames: int = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
print(f"Video: {input_video_path}\nFrames FPS: {fps}, Total frames: {total_frames}")
frame_count: int = 0
frame_skip: int = gFps_skip_interval if gFps_skip_interval >0 else 1
frame_queue: Queue = Queue(maxsize=gQueue_maxsize)
result_dict: OrderedDict =OrderedDict()
result_queue: Queue = Queue()
threads: list = []
for _ in range(num_threads):
t: threading.Thread = threading.Thread(target=worker, args=(frame_queue, result_dict, ref_face))
t.start()
threads.append(t)
while frame_count<total_frames:
ret, frame = cap.read()
if not ret:
break
frame_queue.put((frame, frame_count))
frame_count += frame_skip
frame_count = frame_count if frame_count < total_frames else total_frames
if frame_skip>1:
cap.set(cv2.CAP_PROP_POS_FRAMES, frame_count)
print(f"Processing frame {frame_count}/{total_frames}",end="\r")
for _ in range(num_threads):
frame_queue.put((None, None))
for t in threads:
t.join()
for frame_count in sorted(result_dict.keys()):
result_queue.put((frame_count, result_dict[frame_count]))
return fps,result_queue
finally:
cap.release()
gFaceApp=None
torch.cuda.empty_cache()
frame_queue=None
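# Top-level pipeline: load the reference face, scan the input video, then either rebuild a
# frame-accurate video with cv2 (--must_has_face) or cut and merge clips with ffmpeg.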
def clipVideoForFace(srcFace: str, input_video: str, output_video: str, audio:bool=True):
global gFaceApp,gQueue_maxsize
try:
reference_image: np.ndarray = cv2.imread(srcFace)
reference_faces: list = get_FaceApp().get(reference_image)
if len(reference_faces) == 0:
raise ValueError("Reference image does not contain any faces.")
reference_face: Face = reference_faces[0]
print(f"{srcFace} Reference face loaded.")
    except Exception as e:
        print(f"Error loading reference face: {e}")
        return
fps,result_queue = get_faceFramesToQueue(worker,input_video,reference_face,num_threads=4)
output_vidio_dir = os.path.dirname(output_video)
os.makedirs(output_vidio_dir, exist_ok=True)
print(f"Output video directory: {output_vidio_dir}")
save_queue_json(result_queue,output_vidio_dir)
if gMust_have_face and not result_queue.empty():
cv2_extractAndMergeClips(input_video, output_video, result_queue)
else:
face_timestamps = genFaceTimestamps(result_queue, fps, gFps_skip_interval)
if not face_timestamps:
print(f"File {input_video} does not contain any face timestamps found.")
return
with open(os.path.join(output_vidio_dir,"face_timestamps.txt"), 'w') as f:
for start_time, end_time in face_timestamps:
f.write(f"{start_time:.3f} {end_time:.3f}\n")
print(f'\n{len(face_timestamps)} face timestamps found.')
ffmpeg_extractAndMergeClips(input_video, output_video, fps,face_timestamps, audio=audio)#addtime=round(gFps_skip_interval/fps,3))
print(f"Output video saved to {output_video}.")
_ProcessQueueDone=(None,None)
class SaveQueueToVidso(threading.Thread):
def __init__(self,fps:float=None,frame_width:int=None,frame_height:int=None):
super().__init__()
self.queue=Queue()
self.fps=fps
self.frame_width=frame_width
self.frame_height=frame_height
def set_args(self,processFrames:list,output_video_path:str):
with THREAD_SEMAPHORE:
self.queue.put((processFrames,output_video_path))
    def run(self):
        while True:
            processFrames, out_path = self.queue.get()
            if processFrames is None or len(processFrames) == 0:
                self.queue.task_done()
                break
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            out = cv2.VideoWriter(out_path, fourcc, self.fps, (self.frame_width, self.frame_height))
            for frame in processFrames:
                if frame is None:
                    break
                out.write(frame)
            out.release()  # finalize this clip before handling the next batch
            self.queue.task_done()
            processFrames = None
        print("SaveQueueToVidso thread exit.")
# Use cv2 to write a queue of video frames out to a video file
def queueToVideo(queue:Queue,output_video_path:str,fps:float,frame_width:int,frame_height:int):
global gQueue_maxsize
try:
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))
while not queue.empty():
frame = queue.get()
if frame is None:
break
out.write(frame)
queue.task_done()
out.release()
except Exception as e:
print(f"Error writing video: {e}")
# Walk the per-frame results, grab the matching frames from the video, write every
# 1024 frames to a temporary video in the output directory, and finally merge them into one video.
def cv2_extractAndMergeClips(input_video_path:str, output_video_path:str,result_dict:Queue):
global gFps_skip_interval
tempFileNames:list[str]=[]
output_vidio_dir:str = os.path.dirname(output_video_path)
frame_count:int = 0
lstFrame:Queue=Queue(1024)
    # initialize the video reader
try:
cap: cv2.VideoCapture = cv2.VideoCapture(input_video_path)
fps: float = round(cap.get(cv2.CAP_PROP_FPS),ROUND_POINT)
frame_width: int = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height: int = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
total_frames: int = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
sqtv:SaveQueueToVidso=SaveQueueToVidso(fps=fps,frame_width=frame_width,frame_height=frame_height)
sqtv.start()
        # iterate over the results and grab the corresponding frames from the video
        # for index, (frameNum, result) in enumerate(result_dict.items()):
while not result_dict.empty():
frameNum,result = result_dict.get()
if result:
for i in range(int(gFps_skip_interval)):
cap.set(cv2.CAP_PROP_POS_FRAMES, frameNum+i)
ret, frame = cap.read()
if not ret:
break
while True:
try:
lstFrame.put(frame,block=False)
frame_count+=1
print(f'Processing frame {frameNum+i}/{total_frames}',end='\r')
break
except Full:
temp_video_path:str = os.path.join(output_vidio_dir , f"temp_video_{frame_count}.mp4")
procesFrames=list(lstFrame.queue)
lstFrame.queue.clear()
sqtv.set_args(procesFrames,temp_video_path)
tempFileNames.append(temp_video_path)
if not lstFrame.empty():
            # merge the frames remaining in lstFrame into one video
temp_video_path:str = os.path.join(output_vidio_dir , f"temp_video_{frame_count}.mp4")
procesFrames=list(lstFrame.queue)
lstFrame.queue.clear()
sqtv.set_args(procesFrames,temp_video_path)
tempFileNames.append(temp_video_path)
        # merge the temporary videos in tempFileNames into a single output video
sqtv.queue.put(_ProcessQueueDone)
sqtv.join()
if len(tempFileNames)>1:
concat_list_path:str = os.path.join(output_vidio_dir , "concat_list.txt")
with open(concat_list_path, 'w') as f:
for temp_video_path in tempFileNames:
f.write(f"file '{os.path.abspath(temp_video_path)}'\n")
temp_video_path:str = os.path.join(output_vidio_dir , "temp_video_merged.mp4")
ffmpeg_merge_video(concat_list_path, temp_video_path, audio=False)
os.rename(temp_video_path, output_video_path)
if not gReserve_temp_files:
for temp_video_path in tempFileNames:
os.remove(temp_video_path)
else:
os.rename(tempFileNames[0], output_video_path)
print(f"Output video saved to {output_video_path}.")
except Exception as e:
print(f"Error processing video: {e}")
finally:
cap.release()
def argParse():
global gSrcFace, gInput_video, gOutput_video, gNo_audio,gFps_skip_interval,\
gReserve_temp_files,BIG_GPUMEM,gMust_have_face#,gClip_interval
parser = argparse.ArgumentParser()
parser.add_argument("srcFace", help="Path to reference face image.")
parser.add_argument("input_video", help="Path to input video.")
parser.add_argument("--must_has_face",default=False,action="store_true",help="Must have face in frame.")
parser.add_argument("-o","--output_video", default=None, help="Path to output video.")
parser.add_argument("--no_audio", default=False, action="store_true", help="Disable audio in output video.")
parser.add_argument("--big_gpumem", default=False, action="store_true", help="Default is False. If you GPU memory >2G,Set it.")
# parser.add_argument("--clip_interval", default=0.2, type=float, help="Interval between clips in seconds.")
parser.add_argument("--reserve_temp_files", default=False, action="store_true", help="Keep temporary files after merging.")
parser.add_argument("--fps_skip_interval", default=2, type=int, help="Interval between frames to skip in seconds.")
args = parser.parse_args()
gSrcFace = args.srcFace
gInput_video = args.input_video
gOutput_video = args.output_video if args.output_video else \
os.path.join(os.path.dirname(gInput_video) ,
f"{gOutput_Dir}\\clip_output_{os.path.splitext(os.path.basename(gInput_video))[0]}.mp4")
gNo_audio = args.no_audio
BIG_GPUMEM = args.big_gpumem
# gClip_interval = args.clip_interval
gReserve_temp_files = args.reserve_temp_files
gFps_skip_interval = args.fps_skip_interval
gMust_have_face = args.must_has_face
def main():
global gSrcFace, gInput_video, gOutput_video, gNo_audio
try:
argParse()
    except Exception as e:
        print(f"Error parsing arguments: {e}")
exit(1)
startTime=time.time()
Init()
clipVideoForFace(gSrcFace, gInput_video, gOutput_video, not gNo_audio)
endTime=time.time()
print(f"Total time taken: {endTime-startTime:.2f} seconds.")
VERSION="0.5.0"
if __name__ == '__main__':
main()
Basic usage:
1. srcFace: the reference face image used to identify the person in the video; a screenshot from the video works well.
2. input_video: the video from which clips containing the specified face will be extracted (see the example invocation after this list).
3. --fps_skip_interval: the step used when sampling video frames; minimum 1, default 2, 4 is recommended (i.e. recognition runs once every 4 frames). The larger the value, the faster the processing but the lower the recognition accuracy. With this set to 4, a 10-minute video takes roughly 7 minutes to process.
4. --must_has_face: ensures every output frame contains the specified face, but because the output is rebuilt directly from the extracted frames, it has no audio.
5. FFPEG_CLIP_LOWEST_FPS: int = 26 is the minimum number of frames per clip extracted with ffmpeg, so even with must_has_face specified the result may still contain around 25 frames without the specified face.
6. The code is for reference only.
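For reference, an example invocation (assuming the script is saved as clip_face_video.py; all paths here are illustrative) might look like:

python clip_face_video.py face.jpg D:\videos\input.mp4 --fps_skip_interval 4 --no_audio

With no -o/--output_video given, the result is written to a Clip_Output folder next to the input video, e.g. D:\videos\Clip_Output\clip_output_input.mp4.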