视频检索

最新推荐文章于 2024-06-26 10:09:05 发布

沙雅云

最新推荐文章于 2024-06-26 10:09:05 发布

阅读量2.2k

点赞数 1

分类专栏：视频中的目标检测

本文链接：https://blog.csdn.net/yychentracy/article/details/103304930

版权

视频中的目标检测专栏收录该内容

9 篇文章 0 订阅

订阅专栏

视频检索
本文采用了三种方法进行视频检索
1 根据图像检索视频关键帧的算法
2 采用开源项目 videosearch 的方式进行检索
3 采用yolo视频目标检测的算法进行检索
下面针对三种方法进行详细说明

根据图像检索视频关键帧的算法

step1:提取新闻片头关键帧
step2：提取整个新闻的关键帧
step3：定义图像的相似度匹配算法
step4：根据相似的匹配算法匹配关键帧
代码如下：

#关键帧提取算法
# -*- coding: utf-8 -*-
"""
Created on Tue Dec  4 16:48:57 2018
keyframes extract tool
this key frame extract algorithm is based on interframe difference.
The principle is very simple
First, we load the video and compute the interframe difference between each frames
Then, we can choose one of these three methods to extract keyframes, which are 
all based on the difference method:
    
1. use the difference order
    The first few frames with the largest average interframe difference 
    are considered to be key frames.
2. use the difference threshold
    The frames whose average interframe difference is larger than the 
    threshold are considered to be key frames.
3. use local maximum
    The frames whose average interframe difference is a local maximum are 
    considered to be key frames.
    It should be noted that smoothing the average difference value before 
    calculating the local maximum can effectively remove noise to avoid 
    repeated extraction of frames of similar scenes.
After a few experiment, the third method has a better key frame extraction effect.
The original code comes from the link below, I optimized the code to reduce 
unnecessary memory consumption.
https://blog.csdn.net/qq_21997625/article/details/81285096
@author: zyb_as
""" 
import cv2
import operator
import numpy as np
import matplotlib.pyplot as plt
import sys
from scipy.signal import argrelextrema

 
def smooth(x, window_len=13, window='hanning'):
    """Smooth a 1-D signal by convolving it with a normalised window.

    Before the convolution the signal is extended at both ends with
    point-reflected copies of itself (``2 * x[0] - x[...]`` on the left,
    ``2 * x[-1] - x[...]`` on the right) so that the window never runs over
    the zero padding that ``np.convolve`` would otherwise introduce.  The
    extension is trimmed again afterwards, so the result has the same length
    as ``x``.

    input:
        x: the input signal (1-D array-like)
        window_len: the dimension of the smoothing window; values below 3
            disable smoothing and return the input unchanged
        window: the type of window from 'flat', 'hanning', 'hamming',
            'bartlett', 'blackman'
            flat window will produce a moving average smoothing.
    output:
        the smoothed signal, same length as ``x``
    raises:
        ValueError: if ``x`` is not one-dimensional, if ``x`` is not longer
            than ``window_len``, or if ``window`` is not a supported name

    example:
    import numpy as np
    t = np.linspace(-2, 2, 50)
    x = np.sin(t) + np.random.randn(len(t)) * 0.1
    y = smooth(x)

    see also:

    numpy.hanning, numpy.hamming, numpy.bartlett, numpy.blackman, numpy.convolve
    scipy.signal.lfilter

    TODO: the window parameter could be the window itself if an array instead of a string
    """
    x = np.asarray(x)
    if x.ndim != 1:
        raise ValueError("smooth only accepts 1 dimension arrays.")

    # A window shorter than 3 samples cannot smooth anything; without this
    # guard window_len == 1 returned an empty array and window_len == 2
    # divided by a zero window sum.
    if window_len < 3:
        return x

    # The left extension reads x[window_len], so the signal must be strictly
    # longer than the window or the output length no longer matches the input.
    if x.size <= window_len:
        raise ValueError("Input vector needs to be bigger than window size.")

    if window not in ('flat', 'hanning', 'hamming', 'bartlett', 'blackman'):
        raise ValueError(
            "Window is one of 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'")

    # window_len - 1 reflected samples on each side.
    s = np.r_[2 * x[0] - x[window_len:1:-1],
              x, 2 * x[-1] - x[-1:-window_len:-1]]

    if window == 'flat':  # moving average
        w = np.ones(window_len, 'd')
    else:
        # The name was validated above, so this only resolves np.hanning & co.
        w = getattr(np, window)(window_len)
    y = np.convolve(w / w.sum(), s, mode='same')
    # Drop the reflected extension again.
    return y[window_len - 1:-window_len + 1]
 

class Frame:
    """Pair a frame index with its mean inter-frame difference.

    Ordering, equality and hashing use ``id`` only; ``diff`` is payload that
    callers sort on explicitly (e.g. ``operator.attrgetter("diff")``).
    """

    def __init__(self, id, diff):
        # ``id`` shadows the builtin, but the parameter name is part of the
        # constructor's interface and is therefore kept.
        self.id = id
        self.diff = diff

    def __repr__(self):
        return "Frame(id={!r}, diff={!r})".format(self.id, self.diff)

    def __lt__(self, other):
        if not isinstance(other, Frame):
            return NotImplemented
        return self.id < other.id

    def __gt__(self, other):
        if not isinstance(other, Frame):
            return NotImplemented
        return other.id < self.id

    def __eq__(self, other):
        if not isinstance(other, Frame):
            return NotImplemented
        return self.id == other.id

    def __ne__(self, other):
        if not isinstance(other, Frame):
            return NotImplemented
        return self.id != other.id

    def __hash__(self):
        # Defining __eq__ alone makes a class unhashable in Python 3; hash on
        # the same key that equality uses.
        return hash(self.id)
 
 
def rel_change(a, b):
    """Return the change from ``a`` to ``b`` relative to the larger value.

    Intended for non-negative inputs (the caller passes mean absolute frame
    differences).  When the larger value is 0, i.e. both differences are 0
    because consecutive frames were identical, there is no change and 0.0 is
    returned instead of dividing by zero.
    """
    denominator = max(a, b)
    if denominator == 0:
        return 0.0
    return (b - a) / denominator
 
    
if __name__ == "__main__":
    import os  # only this script section needs it

    print(sys.executable)
    # Select frames whose relative diff change exceeds THRESH
    USE_THRESH = False
    # fixed threshold value
    THRESH = 0.6
    # Select the NUM_TOP_FRAMES frames with the largest diff
    USE_TOP_ORDER = False
    # Select frames at local maxima of the smoothed diff curve
    USE_LOCAL_MAXIMA = True
    # Number of top sorted frames
    NUM_TOP_FRAMES = 50

    # Video path of the source file
    videopath = 'myvedio.flv'
    # Directory to store the processed frames
    save_dir = './myvedio_extract_result/'
    # smoothing window size
    len_window = int(50)

    print("target video :" + videopath)
    print("frame save directory: " + save_dir)
    # cv2.imwrite does not create missing directories, so make sure the
    # target exists before anything is written.
    os.makedirs(save_dir, exist_ok=True)

    # load video and compute diff between frames
    cap = cv2.VideoCapture(str(videopath))
    prev_frame = None
    frame_diffs = []
    frames = []
    success, frame = cap.read()
    i = 0
    while success:
        curr_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2LUV)
        if prev_frame is not None:
            diff = cv2.absdiff(curr_frame, prev_frame)
            # summed absolute difference averaged over the pixel count
            diff_sum_mean = np.sum(diff) / (diff.shape[0] * diff.shape[1])
            frame_diffs.append(diff_sum_mean)
            frames.append(Frame(i, diff_sum_mean))
        prev_frame = curr_frame
        i = i + 1
        success, frame = cap.read()
    cap.release()

    # compute keyframe
    keyframe_id_set = set()
    if USE_TOP_ORDER:
        # sort the list in descending order
        frames.sort(key=operator.attrgetter("diff"), reverse=True)
        for keyframe in frames[:NUM_TOP_FRAMES]:
            keyframe_id_set.add(keyframe.id)
    if USE_THRESH:
        print("Using Threshold")
        for previous, current in zip(frames, frames[1:]):
            # float() replaces np.float, which was removed in NumPy 1.24
            if rel_change(float(previous.diff), float(current.diff)) >= THRESH:
                keyframe_id_set.add(current.id)
    if USE_LOCAL_MAXIMA:
        print("Using Local Maxima")
        diff_array = np.array(frame_diffs)
        if len(diff_array) > len_window:
            sm_diff_array = smooth(diff_array, len_window)
            frame_indexes = np.asarray(argrelextrema(sm_diff_array, np.greater))[0]
            for i in frame_indexes:
                # NOTE(review): index i - 1 selects the frame just before the
                # diff peak; kept as in the original code.
                keyframe_id_set.add(frames[i - 1].id)

            plt.figure(figsize=(40, 20))
            plt.locator_params(numticks=100)
            plt.stem(sm_diff_array)
            plt.savefig(save_dir + 'plot.png')
        else:
            print("video too short for a smoothing window of", len_window)

    # save all keyframes as image
    cap = cv2.VideoCapture(str(videopath))
    success, frame = cap.read()
    idx = 0
    # Stop as soon as every selected frame has been written.
    while success and keyframe_id_set:
        if idx in keyframe_id_set:
            name = "keyframe_" + str(idx) + ".jpg"
            cv2.imwrite(save_dir + name, frame)
            keyframe_id_set.remove(idx)
        idx = idx + 1
        success, frame = cap.read()
    cap.release()

计算图片的相似度

#计算图片的相似度距离
# -*- coding: utf-8 -*-
# !/usr/bin/env python
# @Time    : 2018/11/17 14:52
# @Author  : xhh
# @Desc    : 余弦相似度计算
# @File    : difference_image_consin.py
# @Software: PyCharm
from PIL import Image
from numpy import average, dot, linalg
 
# Normalise an image to a common size (and optionally to greyscale)
def get_thum(image, size=(64,64), greyscale=False):
    """Return ``image`` resized to ``size``, optionally converted to mode 'L'.

    image: a PIL image
    size: target (width, height) passed to ``Image.resize``
    greyscale: when true, convert the resized image to 8-bit greyscale
    """
    # Image.ANTIALIAS was an alias for LANCZOS and no longer exists in
    # Pillow 10+; LANCZOS is the same high-quality filter.
    image = image.resize(size, Image.LANCZOS)
    if greyscale:
        # mode 'L' is greyscale, 8 bits per pixel
        image = image.convert('L')
    return image
 
# Cosine similarity between two images
def image_similarity_vectors_via_numpy(image1, image2):
    """Return the cosine similarity of two PIL images.

    Both images are first reduced with ``get_thum``; each pixel is then
    collapsed to the average of its channel values, giving one vector per
    image.  The result is the dot product of the two L2-normalised vectors.

    If either vector has zero norm (e.g. an all-black frame) the cosine is
    undefined; 0.0 is returned in that case so callers never receive NaN,
    which would make ``max()`` over a list of scores unreliable.
    """
    vectors = []
    norms = []
    for image in (get_thum(image1), get_thum(image2)):
        vector = [average(pixel_tuple) for pixel_tuple in image.getdata()]
        vectors.append(vector)
        # L2 norm of the per-pixel average vector
        norms.append(linalg.norm(vector, 2))
    a, b = vectors
    a_norm, b_norm = norms
    if a_norm == 0 or b_norm == 0:
        return 0.0
    # dot product of the two unit vectors
    return dot(a / a_norm, b / b_norm)
 
#'''
#image1 = Image.open('images/1.jpeg')
#image2 = Image.open('myimage64.jpg')
#cosin = image_similarity_vectors_via_numpy(image1, image2)
#print('图片余弦相似度',cosin)

关键帧匹配算法

import os

# Query image that is looked up among the extracted key frames.
image1 = Image.open('11.jpg')
cosin1 = []        # similarity score per candidate
imagematch = []    # file name per candidate, parallel to cosin1
for filename in os.listdir(r"./myvedio_extract_result"):
    # The extraction step also writes plot.png into this directory; only the
    # .jpg key frames are valid candidates.
    if not filename.lower().endswith('.jpg'):
        continue
    with Image.open(os.path.join('myvedio_extract_result', filename)) as image2:
        cosin = image_similarity_vectors_via_numpy(image1, image2)
    cosin1.append(cosin)
    imagematch.append(filename)

if cosin1:
    best = max(cosin1)
    index = cosin1.index(best)
    print(best)               # highest cosine similarity
    print(index)              # position of the best match in the candidate list
    print(imagematch[index])  # file name of the best match
else:
    print("no key frame images found in ./myvedio_extract_result")

这是最后的结果（程序依次打印三行）：
1.0 —— 最大的余弦相似度为 1.0，即查询图片与某个关键帧完全匹配
258 —— 匹配到的关键帧在候选列表中的下标为 258
keyframe_56.jpg —— 匹配到的关键帧文件，对应视频的第 56 帧

采用 videosearch 的方式进行检索

开源视频检索项目 videosearch
https://blog.csdn.net/meloyi/article/details/53034823
https://github.com/andrefaraujo/videosearch
本项目主要做的事情是：
1 提取视频的关键帧（关键帧，其实就是视频中的一张图像）。对视频进行镜头边缘检测。
2 对图片或者帧提取SIFT（尺度不变特征变换，用于在图像中检测出关键点，是一种局部特征描述子）。
3 为每张图片/关键帧、镜头或者视频片段提取全局描述子（Fisher Vectors）。
4 使用Bloom Filters对每个视频片段进行索引。
5 使用图片对图片或者视频数据库进行检索。
6 用平均精度均值（mean average precision，mAP）和 Precision@1 来评估检索结果。

本项目代码可以优化，得到最后的视频匹配效果
按照上述参考资料运行该项目进行视频检索，可以得到很好的匹配效果，但检索速度达不到实时。

采用yolo视频目标检测的算法进行检索

修改yolov3目标检测到视频的目标检测
step1 安装 CUDA、cuDNN 和 darknet
step2 运行darknet 图像的目标检测
step3 修改darknet 可以运行视频的目标检测
详细步骤如下：

下载代码：

git clone https://github.com/pjreddie/darknet

编译代码

cd darknet
make

下载权重文件

wget https://pjreddie.com/media/files/yolov3.weights

修改代码运行

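首先确认“libdarknet.so”的位置：可以将“darknet”文件夹内的“libdarknet.so”文件移动到“darknet/python”内；但要注意下文的 darknet.py 是用 CDLL("../libdarknet.so") 加载该库的，即从“darknet/python”目录运行时库文件应位于“darknet”根目录，如果移动了文件，需要把加载路径相应改为 "./libdarknet.so"。
打开“darknet/cfg/coco.data”文件，将“names”也改为绝对路径。
进入“darknet/python”然后执行“darknet.py”文件即可。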

修改代码进行视频的目标检测

1.修改src/image.c

#ifdef NUMPY
/*
 * Build a darknet image from a strided 3-D byte buffer.
 *
 * shape   = {rows, cols, channels}
 * strides = offsets between consecutive rows, columns and channels of src,
 *           used directly as indices into the unsigned char buffer.
 *
 * Every sample is scaled by 1/255 and stored channel-major
 * (channel * w * h + row * w + col).  rgbgr_image() is applied to the
 * result before it is returned (presumably a red/blue channel swap;
 * confirm against its definition).
 */
image ndarray_to_image(unsigned char* src, long* shape, long* strides)
{
    int h = shape[0];
    int w = shape[1];
    int c = shape[2];
    int step_h = strides[0];
    int step_w = strides[1];
    int step_c = strides[2];
    image im = make_image(w, h, c);
    int row, col, ch;

    for (row = 0; row < h; ++row) {
        for (ch = 0; ch < c; ++ch) {
            for (col = 0; col < w; ++col) {
                int dst_index = ch*w*h + row*w + col;
                int src_index = step_h*row + step_w*col + step_c*ch;

                im.data[dst_index] = src[src_index]/255.;
            }
        }
    }

    rgbgr_image(im);

    return im;
}
#endif

2.然后在src/image.h大概22行插入：

#ifdef NUMPY
/* Convert a strided 3-D byte buffer (shape/strides arrays of length 3) into
 * a darknet image; only compiled when NUMPY is defined. */
image ndarray_to_image(unsigned char* src, long* shape, long* strides);
#endif

3.修改Makefile文件：

GPU=1
CUDNN=1
OPENCV=1
# Added: defines NUMPY so that ndarray_to_image (guarded by #ifdef NUMPY) is compiled
NUMPY=1
OPENMP=1
# NOTE(review): DEBUG=1 switches OPTS to -O0 -g further down, i.e. an
# unoptimised build; set to 0 when measuring inference speed.
DEBUG=1
 
ARCH= -gencode arch=compute_30,code=sm_30 \
      -gencode arch=compute_35,code=sm_35 \
      -gencode arch=compute_50,code=[sm_50,compute_50] \
      -gencode arch=compute_52,code=[sm_52,compute_52] \
	  -gencode arch=compute_70,code=[sm_70,compute_70] \
	  -gencode arch=compute_75,code=[sm_75,compute_75]
#      -gencode arch=compute_20,code=[sm_20,sm_21] \ This one is deprecated?
 
# This is what I use, uncomment if you know your arch and want to specify
# ARCH= -gencode arch=compute_52,code=compute_52
 
VPATH=./src/:./examples
SLIB=libdarknet.so
ALIB=libdarknet.a
EXEC=darknet
OBJDIR=./obj/
 
CC=gcc
CPP=g++
NVCC=nvcc 
AR=ar
ARFLAGS=rcs
OPTS=-Ofast
LDFLAGS= -lm -pthread 
COMMON= -Iinclude/ -Isrc/
CFLAGS=-Wall -Wno-unused-result -Wno-unknown-pragmas -Wfatal-errors -fPIC
 
ifeq ($(OPENMP), 1) 
CFLAGS+= -fopenmp
endif
 
ifeq ($(DEBUG), 1) 
OPTS=-O0 -g
endif
 
CFLAGS+=$(OPTS)
 
ifeq ($(OPENCV), 1) 
COMMON+= -DOPENCV
CFLAGS+= -DOPENCV
LDFLAGS+= `pkg-config --libs opencv` -lstdc++
COMMON+= `pkg-config --cflags opencv` 
endif
# Added: NumPy support. The two -I paths are specific to the author's
# machine (conda env "tracy", Python 3.6) and must be adapted locally.
ifeq ($(NUMPY), 1) 
COMMON+= -DNUMPY -I/home/sbs/anaconda3/envs/tracy/include/python3.6m/ -I/home/sbs/anaconda3/envs/tracy/lib/python3.6/site-packages/numpy/core/include/numpy/
CFLAGS+= -DNUMPY
endif
 
ifeq ($(GPU), 1) 
COMMON+= -DGPU -I/usr/local/cuda-10.0-cudnn-7.3.1/include/
CFLAGS+= -DGPU
LDFLAGS+= -L/usr/local/cuda-10.0-cudnn-7.3.1/lib64 -lcuda -lcudart -lcublas -lcurand
endif
 
ifeq ($(CUDNN), 1) 
COMMON+= -DCUDNN 
CFLAGS+= -DCUDNN
LDFLAGS+= -lcudnn
endif
 
OBJ=gemm.o utils.o cuda.o deconvolutional_layer.o convolutional_layer.o list.o image.o activations.o im2col.o col2im.o blas.o crop_layer.o dropout_layer.o maxpool_layer.o softmax_layer.o data.o matrix.o network.o connected_layer.o cost_layer.o parser.o option_list.o detection_layer.o route_layer.o upsample_layer.o box.o normalization_layer.o avgpool_layer.o layer.o local_layer.o shortcut_layer.o logistic_layer.o activation_layer.o rnn_layer.o gru_layer.o crnn_layer.o demo.o batchnorm_layer.o region_layer.o reorg_layer.o tree.o  lstm_layer.o l2norm_layer.o yolo_layer.o iseg_layer.o image_opencv.o
EXECOBJA=captcha.o lsd.o super.o art.o tag.o cifar.o go.o rnn.o segmenter.o regressor.o classifier.o coco.o yolo.o detector.o nightmare.o instance-segmenter.o darknet.o
ifeq ($(GPU), 1) 
LDFLAGS+= -lstdc++ 
OBJ+=convolutional_kernels.o deconvolutional_kernels.o activation_kernels.o im2col_kernels.o col2im_kernels.o blas_kernels.o crop_layer_kernels.o dropout_layer_kernels.o maxpool_layer_kernels.o avgpool_layer_kernels.o
endif
 
EXECOBJ = $(addprefix $(OBJDIR), $(EXECOBJA))
OBJS = $(addprefix $(OBJDIR), $(OBJ))
DEPS = $(wildcard src/*.h) Makefile include/darknet.h
 
all: obj backup results $(SLIB) $(ALIB) $(EXEC)
#all: obj  results $(SLIB) $(ALIB) $(EXEC)
 
 
$(EXEC): $(EXECOBJ) $(ALIB)
	$(CC) $(COMMON) $(CFLAGS) $^ -o $@ $(LDFLAGS) $(ALIB)
 
$(ALIB): $(OBJS)
	$(AR) $(ARFLAGS) $@ $^
 
$(SLIB): $(OBJS)
	$(CC) $(CFLAGS) -shared $^ -o $@ $(LDFLAGS)
 
$(OBJDIR)%.o: %.cpp $(DEPS)
	$(CPP) $(COMMON) $(CFLAGS) -c $< -o $@
 
$(OBJDIR)%.o: %.c $(DEPS)
	$(CC) $(COMMON) $(CFLAGS) -c $< -o $@
 
$(OBJDIR)%.o: %.cu $(DEPS)
	$(NVCC) $(ARCH) $(COMMON) --compiler-options "$(CFLAGS)" -c $< -o $@
 
obj:
	mkdir -p obj
backup:
	mkdir -p backup
results:
	mkdir -p results
 
.PHONY: clean
 
clean:
	rm -rf $(OBJS) $(SLIB) $(ALIB) $(EXEC) $(EXECOBJ) $(OBJDIR)/*

4.然后make clean再重新make -j8就可以
5.修改python/darknet.py文件，这里我直接贴完整的程序了

from ctypes import *
import math
import random
import time
import numpy as np
import cv2
import os
import sys
 
def sample(probs):
    """Draw an index at random, weighted by ``probs``.

    The weights are normalised by their sum, a uniform number in [0, 1] is
    drawn, and the weights are subtracted from it one by one; the first index
    at which the remainder is no longer positive is returned.  If rounding
    leaves a positive remainder after the last weight, the last index is
    returned.
    """
    total = sum(probs)
    normalised = [weight / total for weight in probs]
    remaining = random.uniform(0, 1)
    for index, weight in enumerate(normalised):
        remaining = remaining - weight
        if remaining <= 0:
            return index
    return len(normalised) - 1
 
def c_array(ctype, values):
    """Return a ctypes array of ``ctype`` holding the items of ``values``."""
    array_type = ctype * len(values)
    return array_type(*values)
 
class BOX(Structure):
    """Bounding box as four C floats.

    ``convertBack`` in this file treats (x, y) as the box centre and (w, h)
    as its full width and height.
    """
    _fields_ = [("x", c_float),
                ("y", c_float),
                ("w", c_float),
                ("h", c_float)]
 
class DETECTION(Structure):
    """One detection returned by ``get_network_boxes``.

    ``prob`` is indexed by class id in ``detect``/``detect_im``.  The field
    order and types presumably mirror darknet's C ``detection`` struct and
    must not be reordered -- confirm against include/darknet.h.
    """
    _fields_ = [("bbox", BOX),
                ("classes", c_int),
                ("prob", POINTER(c_float)),
                ("mask", POINTER(c_float)),
                ("objectness", c_float),
                ("sort_class", c_int)]
 
 
class IMAGE(Structure):
    """Image handle: width, height, channel count and a float data pointer."""
    _fields_ = [("w", c_int),
                ("h", c_int),
                ("c", c_int),
                ("data", POINTER(c_float))]
 
class METADATA(Structure):
    """Class count plus an array of class-name byte strings (see load_meta)."""
    _fields_ = [("classes", c_int),
                ("names", POINTER(c_char_p))]
 
    
 
# ctypes bindings for libdarknet.so.  The path is relative to the current
# working directory, so the script has to be started from darknet/python.
#lib = CDLL("/home/pjreddie/documents/darknet/libdarknet.so", RTLD_GLOBAL)
lib = CDLL("../libdarknet.so", RTLD_GLOBAL)
lib.network_width.argtypes = [c_void_p]
lib.network_width.restype = c_int
lib.network_height.argtypes = [c_void_p]
lib.network_height.restype = c_int
 
predict = lib.network_predict
predict.argtypes = [c_void_p, POINTER(c_float)]
predict.restype = POINTER(c_float)
 
set_gpu = lib.cuda_set_device
set_gpu.argtypes = [c_int]
 
make_image = lib.make_image
make_image.argtypes = [c_int, c_int, c_int]
make_image.restype = IMAGE
 
get_network_boxes = lib.get_network_boxes
get_network_boxes.argtypes = [c_void_p, c_int, c_int, c_float, c_float, POINTER(c_int), c_int, POINTER(c_int)]
get_network_boxes.restype = POINTER(DETECTION)
 
make_network_boxes = lib.make_network_boxes
make_network_boxes.argtypes = [c_void_p]
make_network_boxes.restype = POINTER(DETECTION)
 
free_detections = lib.free_detections
free_detections.argtypes = [POINTER(DETECTION), c_int]
 
free_ptrs = lib.free_ptrs
free_ptrs.argtypes = [POINTER(c_void_p), c_int]
 
# Same symbol as ``predict`` above, bound under a second name; this only
# re-assigns the argument types already set there.
network_predict = lib.network_predict
network_predict.argtypes = [c_void_p, POINTER(c_float)]
 
reset_rnn = lib.reset_rnn
reset_rnn.argtypes = [c_void_p]
 
load_net = lib.load_network
load_net.argtypes = [c_char_p, c_char_p, c_int]
load_net.restype = c_void_p
 
do_nms_obj = lib.do_nms_obj
do_nms_obj.argtypes = [POINTER(DETECTION), c_int, c_int, c_float]
 
do_nms_sort = lib.do_nms_sort
do_nms_sort.argtypes = [POINTER(DETECTION), c_int, c_int, c_float]
 
free_image = lib.free_image
free_image.argtypes = [IMAGE]
 
letterbox_image = lib.letterbox_image
letterbox_image.argtypes = [IMAGE, c_int, c_int]
letterbox_image.restype = IMAGE
 
load_meta = lib.get_metadata
lib.get_metadata.argtypes = [c_char_p]
lib.get_metadata.restype = METADATA
 
load_image = lib.load_image_color
load_image.argtypes = [c_char_p, c_int, c_int]
load_image.restype = IMAGE
 
# Added for video processing: requires libdarknet.so to be built with
# NUMPY=1, otherwise the ndarray_to_image symbol does not exist.
ndarray_image = lib.ndarray_to_image
ndarray_image.argtypes = [POINTER(c_ubyte), POINTER(c_long), POINTER(c_long)]
ndarray_image.restype = IMAGE
 
rgbgr_image = lib.rgbgr_image
rgbgr_image.argtypes = [IMAGE]
 
predict_image = lib.network_predict_image
predict_image.argtypes = [c_void_p, IMAGE]
predict_image.restype = POINTER(c_float)
 
def classify(net, meta, im):
    """Run the network on ``im`` and return (name, score) pairs, best first.

    One pair is produced for each of the ``meta.classes`` classes, taken
    from ``meta.names`` and the output of ``predict_image``.
    """
    scores = predict_image(net, im)
    ranked = [(meta.names[idx], scores[idx]) for idx in range(meta.classes)]
    ranked.sort(key=lambda pair: -pair[1])
    return ranked
 
"""
Yolo-v3目前耗时过长的步骤
    1.输入图像的预处理阶段
    2.python接口调用网络执行一次推理过程
"""
 
def detect(net, meta, image, thresh=.5, hier_thresh=.5, nms=.45):
    """Detect objects in an image file.

    net, meta: the values returned by ``load_net`` / ``load_meta``
    image: path of the image file as bytes (passed to ``load_image``)
    thresh, hier_thresh: thresholds forwarded to ``get_network_boxes``
    nms: threshold forwarded to ``do_nms_obj``; a falsy value skips that step

    Returns a list of ``(name, probability, (x, y, w, h))`` tuples sorted by
    descending probability.
    """
    # Loading the image took about 0.11 s in the original author's measurement.
    im = load_image(image, 0, 0)
    count = c_int(0)
    count_ptr = pointer(count)

    # Inference took about 0.13 s in the original author's measurement.
    predict_image(net, im)

    # Collecting the boxes took about 0.002 s; NMS time was negligible.
    dets = get_network_boxes(net, im.w, im.h, thresh, hier_thresh,
                             None, 0, count_ptr)
    num = count_ptr[0]
    if nms:
        do_nms_obj(dets, num, meta.classes, nms)

    res = []
    for j in range(num):
        for i in range(meta.classes):
            if dets[j].prob[i] > 0:
                box = dets[j].bbox
                res.append((meta.names[i], dets[j].prob[i],
                            (box.x, box.y, box.w, box.h)))
    res.sort(key=lambda hit: -hit[1])

    free_image(im)
    free_detections(dets, num)
    return res
 
# Added for video processing
def detect_im(net, meta, im, thresh=.5, hier_thresh=.5, nms=.45):
    """Detect objects in an in-memory frame.

    Same contract as ``detect``, but ``im`` is a NumPy array (converted via
    ``array_to_image``) rather than a file path.  Returns a list of
    ``(name, probability, (x, y, w, h))`` tuples sorted by descending
    probability.
    """
    # Conversion took ~0.0012-0.0013 s in the original author's measurement.
    # ``image`` is the backing float array and is kept referenced while the
    # IMAGE struct is in use.
    im, image = array_to_image(im)
    # Channel swap, ~0.0013 s in the original measurement.
    rgbgr_image(im)

    count = c_int(0)
    count_ptr = pointer(count)
    # Inference took about 0.083 s in the original measurement.
    predict_image(net, im)
    dets = get_network_boxes(net, im.w, im.h, thresh, hier_thresh,
                             None, 0, count_ptr)
    num = count_ptr[0]
    if nms:
        do_nms_obj(dets, num, meta.classes, nms)

    res = []
    for j in range(num):
        probs = dets[j].prob[0:meta.classes]
        if not any(probs):
            continue
        for i in np.array(probs).nonzero()[0]:
            box = dets[j].bbox
            res.append((meta.names[i], dets[j].prob[i],
                        (box.x, box.y, box.w, box.h)))

    res.sort(key=lambda hit: -hit[1])
    # The pixel buffer belongs to the NumPy array, so it is only handed to
    # free_image when ``image`` is a bytes object.
    if isinstance(image, bytes):
        free_image(im)
    free_detections(dets, num)

    return res
 
def array_to_image(arr):
    """Wrap a 3-D array with channels on the last axis in an ``IMAGE`` struct.

    The array is transposed to channel-first order, flattened to a
    contiguous float32 buffer and divided by 255.  Returns ``(im, buffer)``;
    the buffer is returned as well so the caller keeps it referenced while
    ``im`` points at its memory.
    """
    channel_first = arr.transpose(2, 0, 1)
    c, h, w = channel_first.shape[0:3]
    buffer = np.ascontiguousarray(channel_first.flat, dtype=np.float32) / 255.0
    pixels = buffer.ctypes.data_as(POINTER(c_float))
    return IMAGE(w, h, c, pixels), buffer
 
def get_folderImages(folder):
    """Return every entry of ``folder`` joined onto ``folder`` as a path."""
    paths = []
    for entry in os.listdir(folder):
        paths.append(os.path.join(folder, entry))
    return paths
 
def convertBack(x, y, w, h):
    """Convert a centre/size box to integer corners ``(xmin, ymin, xmax, ymax)``.

    Each corner coordinate is rounded with ``round`` and converted to ``int``.
    """
    half_w = w / 2
    half_h = h / 2
    left, right = int(round(x - half_w)), int(round(x + half_w))
    top, bottom = int(round(y - half_h)), int(round(y + half_h))
    return left, top, right, bottom
 
def init():
    """Load the YOLOv3 network and the COCO metadata.

    All paths are relative to the current working directory.  Returns
    ``(net, meta)``.
    """
    cfg_path = "../cfg/yolov3.cfg".encode("utf-8")
    weights_path = "../cfg/yolov3.weights".encode("utf-8")
    data_path = "../cfg/coco.data".encode("utf-8")
    net = load_net(cfg_path, weights_path, 0)
    return net, load_meta(data_path)
 
def image_processing():
    """Run detection on every file in ``images/`` and save annotated copies.

    For each readable image the detections are drawn (yellow box, red class
    label) and the picture is written to ``results/<name>-result.jpg``; an
    image without detections is written unannotated.  The mean detection
    time is printed at the end.
    """
    net, meta = init()

    folder = "images"
    save_folder = "results"
    # cv2.imwrite fails silently when the target directory is missing.
    os.makedirs(save_folder, exist_ok=True)
    each_process_time = []

    for image_path in get_folderImages(folder):
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread returns None for files it cannot decode.
            print("skipping unreadable image:", image_path)
            continue
        start_time = time.time()
        r = detect(net, meta, image_path.encode("utf-8"))
        each_process_time.append(time.time() - start_time)

        # Draw directly on ``image`` so that an image with no detections is
        # still saved as itself instead of reusing the previous result.
        for name, _prob, (x, y, w, h) in r:
            xmin, ymin, xmax, ymax = convertBack(float(x), float(y), float(w), float(h))
            cv2.rectangle(image, (xmin, ymin), (xmax, ymax), (0, 255, 255), 2)
            cv2.putText(
                image,
                bytes.decode(name),
                (xmin, ymin),
                cv2.FONT_HERSHEY_SIMPLEX,
                1.0,
                (0, 0, 255),
                2
            )

        stem = os.path.splitext(os.path.basename(image_path))[0]
        save_path = os.path.join(save_folder, stem + "-result.jpg")
        cv2.imwrite(save_path, image)

    # np.mean of an empty list is NaN plus a warning; report 0.0 instead.
    average_processing_time = np.mean(each_process_time) if each_process_time else 0.0
    print("Yolo-v3 COCO Average each Image processing Time:\n")
    print(average_processing_time)
 
def video_processing(processing_path="small.mp4", gpu_id=7):
    """Run detection on a video and write an annotated copy next to it.

    processing_path: input video; the output is ``<stem>-result.mp4``
    gpu_id: device index passed to ``set_gpu`` (the original hard-coded 7)

    Every ``timeF``-th frame is passed through ``detect_im`` and annotated;
    all frames are written to the output.  Reading stops when the capture
    reports no more frames.
    """
    set_gpu(gpu_id)
    net, meta = init()

    cam = cv2.VideoCapture(processing_path)
    total_frames = cam.get(cv2.CAP_PROP_FRAME_COUNT)
    fps = cam.get(cv2.CAP_PROP_FPS)
    frame_size = (int(cam.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cam.get(cv2.CAP_PROP_FRAME_HEIGHT)))
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    processing_result_name = os.path.splitext(processing_path)[0] + "-result.mp4"
    result = cv2.VideoWriter(processing_result_name, fourcc, fps, frame_size)

    timeF = 1  # run detection on every timeF-th frame
    c = 1      # 1-based index of the frame being processed
    print("opencv?", cam.isOpened())
    print("fps:", fps)
    print("decode style:", fourcc)
    print("size:", frame_size)
    print("total frames:", total_frames)
    start_total = time.time()
    try:
        while True:
            frame_start = time.time()
            ok, img = cam.read()
            # Rely on the read status rather than CAP_PROP_FRAME_COUNT to
            # detect the end of the stream.
            if not ok or img is None:
                print("Finished Processing!")
                break
            if c % timeF == 0 or c == total_frames:
                for name, prob, (x, y, w, h) in detect_im(net, meta, img):
                    xmin, ymin, xmax, ymax = convertBack(float(x), float(y), float(w), float(h))
                    cv2.rectangle(img, (xmin, ymin), (xmax, ymax), (0, 255, 255), 1)
                    label_score = "{}:{:.2f}".format(bytes.decode(name), prob)
                    cv2.putText(
                        img,
                        label_score,
                        (xmin, ymin),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        1.0,
                        (0, 0, 255),
                        1
                    )
            result.write(img)
            c += 1
            print("processing one frame total time:", (time.time() - frame_start))
            print()
        processing_time = time.time() - start_total
    finally:
        # Release the capture and the writer even if detection fails.
        cam.release()
        result.release()

    # post_compression is not defined in the code shown here; call it only
    # when the module actually provides it instead of raising NameError.
    compress = globals().get("post_compression")
    if callable(compress):
        compress(processing_result_name)
    print("Yolo-v3 COCO one Video Process Time:\n")
    print(processing_time)
 
# Script entry point: annotates the images in ./images by default; switch
# the two calls at the bottom to process a video instead.  The commented
# lines are older usage examples kept for reference.
if __name__ == "__main__":
    #net = load_net("cfg/densenet201.cfg", "/home/pjreddie/trained/densenet201.weights", 0)
    #im = load_image("data/wolf.jpg", 0, 0)
    #meta = load_meta("cfg/imagenet1k.data")
    #r = classify(net, meta, im)
    #print r[:10]
    # net = load_net("../cfg/yolov3.cfg".encode("utf-8"), "../cfg/yolov3.weights".encode("utf-8"), 0)
    # meta = load_meta("../cfg/coco.data".encode("utf-8"))
    # start_time = time.time()
    # r = detect(net, meta, "../data/car.jpg".encode("utf-8"))
    # print("Inference time:{:.4f}".format(time.time() - start_time))
    # print(r)
    image_processing()
    # video_processing()

运行
./darknet detector demo cfg/coco.data cfg/yolov3.cfg cfg/yolov3.weights python/videos/test.mp4

沙雅云

关注

1
点赞
踩
14

收藏

觉得还不错? 一键收藏
0
评论
视频检索

视频检索本文采用了三种方法进行视频检索1 根据图像检索视频关键帧的算法2 采用vedioSearch的方式进行检索3 采用yolo视频目标检测的算法进行检索下面针对三种方法进行详细说明根据图像检索视频关键帧的算法step1:提取新闻片头关键帧step2：提取整个新闻的关键帧step3：定义图像的相似度匹配算法step4：根据相似的匹配算法匹配关键帧代码如下：#关键帧提取算...
复制链接

扫一扫

专栏目录