Temporal Segment Networks: Core Code

Tags (space-separated): source-code


Network structure (prototxt)

Let's first look at the prototxt network definition, taking the optical-flow stream as the example. The core parts are shown below; only the data input layer and the loss layers are excerpted, and the layers in between form the BN-Inception body (omitted).

name: "BN-Inception"
"""
一个批次读入数据的大小为: batch_size, num_segment * new_length * modilty, image_w, image_h
其他的参量下面都有解释,这里说一下modilty,对于光流图像来说,它是灰度图像,但是它存在两个
方向的光流x, y;对于RGB来说,存在三个通道。
"""

layer {
  name: "data"
  type: "VideoData"
  top: "data"
  top: "label"
  video_data_param {
  # List file; each line describes one video: <frame directory> <number of frames> <class label>
    source: "data/ucf101_flow_train_split_1.txt"
    batch_size: 32
    new_length: 5
  # Split the video evenly into num_segments segments; sample new_length consecutive frames from each
    num_segments: 3
  # Modality: RGB or optical flow
    modality: FLOW
    shuffle: true
  # Frame naming pattern: %c takes the flow direction ('x' or 'y'), %05d the frame index
    name_pattern: "flow_%c_%05d.jpg"
  }
  transform_param{
    crop_size: 224
    mirror: true
    fix_crop: true
    more_fix_crop: true
    multi_scale: true
    max_distort: 1
    scale_ratios: [1,.875,.75]
    mean_value: 128
    is_flow: true
  }
  include: { phase: TRAIN }
}
layer {
  name: "data"
  type: "VideoData"
  top: "data"
  top: "label"
  video_data_param {
    source: "data/ucf101_flow_val_split_1.txt"
    batch_size: 1
    new_length: 5
    num_segments: 3
    modality: FLOW
    name_pattern: "flow_%c_%05d.jpg"
  }
  transform_param{
    crop_size: 224
    mirror: false
    mean_value: 128
    is_flow: true
  }
  include: { phase: TEST }
}
# With the parameter settings above, the data layer outputs a blob of shape (32, 30, 224, 224).

# Reshape the input to (96, 10, 224, 224).
# The goal is to make the second dimension (channels) hold exactly one segment's
# sample, i.e. new_length * modality_channels = 10 channels. This keeps the first
# convolution's weight shape independent of num_segments, which is convenient at
# test time (the channel dimension determines the weight sizes).
layer { name: "data_reshape" type: "Reshape" bottom: "data" top: "data_reshape" reshape_param { shape { dim: [-1, 10, 224, 224] } }}

"""
BN-Inception Network
"""

####################################### global pool #######################################
# Global average pooling reduces each 7x7 feature map to 1 x 1
layer { name: "global_pool" top: "global_pool" bottom: "inception_5b/output" type: "Pooling"
  pooling_param { pool: AVE kernel_size: 7 stride: 1 } }
layer { name: "dropout" top: "global_pool" bottom: "global_pool" type: "Dropout"
dropout_param { dropout_ratio: 0.7 } }

####################################### loss accuracy #######################################
# Map the features to 101 class scores; the output here is 96 x 101
layer { name: "fc-action" type: "InnerProduct" bottom: "global_pool" top: "fc"
  param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 }
  inner_product_param { num_output: 101
    weight_filler { type: "gaussian" std: 0.001 }
    bias_filler { type: "constant" value: 0 } } }
# Reshape back: the network's effective batch_size is 32, so this layer outputs (32, 1, 3, 101).
# The third dimension (3) is num_segments, the number of segments the video was split into.
# In other words, each segment's sampled frames score the video's classes independently,
# and the scores are then fused (mean, max, etc.), matching the segmental consensus in the paper.
layer { name: "reshape_fc" bottom: "fc" top: "reshape_fc" type: "Reshape"
  reshape_param { shape { dim: [-1, 1, 3, 101 ] } }}
layer { name: "pool_fusion" bottom: "reshape_fc" top: "pool_fc" type: "Pooling"
  pooling_param { pool: AVE kernel_h: 3 kernel_w: 1} }
layer { name: "loss" type: "SoftmaxWithLoss" bottom: "pool_fc" bottom: "label" top: "loss" softmax_param {axis: 3} }
layer { name: "accuracy_top1" type: "Accuracy" bottom: "pool_fc" bottom: "label" top: "accuracy" accuracy_param {axis: 3}
  include { phase: TEST } }
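The reshape-plus-average-pooling trick above is just a mean over the segment axis followed by a softmax. A NumPy sketch of the same consensus, assuming per-snippet scores of shape (batch * num_segments, num_classes):

import numpy as np

def segmental_consensus(fc, num_segments=3, num_classes=101):
    # Mirror of reshape_fc + pool_fusion: average the per-segment class scores
    scores = fc.reshape(-1, num_segments, num_classes)  # (batch, 3, 101)
    fused = scores.mean(axis=1)                         # (batch, 101)
    # SoftmaxWithLoss applies a softmax over the class axis before the loss
    e = np.exp(fused - fused.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

probs = segmental_consensus(np.random.randn(96, 101))
print(probs.shape)  # (32, 101)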

Reading video frames: video_data_layer

In TSN, every frame of each video and its optical flow are extracted ahead of time and stored on disk, together with list files such as ucf101_flow_train_split_1.txt. Each line of the list file has the format: <frame directory> <number of frames> <class label>.
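For example, a few lines of such a list file might look like this (paths and frame counts are illustrative, not taken from the actual split file):

ApplyEyeMakeup/v_ApplyEyeMakeup_g08_c01 121 0
ApplyLipstick/v_ApplyLipstick_g01_c01 95 1
Archery/v_Archery_g01_c02 140 2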
Let's look at the source of video_data_layer:

#include <fstream>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

#include "caffe/data_layers.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"
#include "caffe/util/io.hpp"
#include "caffe/util/math_functions.hpp"
#include "caffe/util/rng.hpp"

#ifdef USE_MPI
#include "mpi.h"
#include <boost/filesystem.hpp>
using namespace boost::filesystem;
#endif

namespace caffe{
template <typename Dtype>
VideoDataLayer<Dtype>:: ~VideoDataLayer<Dtype>(){
    this->JoinPrefetchThread();
}

template <typename Dtype>
void VideoDataLayer<Dtype>:: DataLayerSetUp(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top){
// Read the layer parameters from the prototxt
    const int new_height  = this->layer_param_.video_data_param().new_height();
    const int new_width  = this->layer_param_.video_data_param().new_width();
    const int new_length  = this->layer_param_.video_data_param().new_length();
    const int num_segments = this->layer_param_.video_data_param().num_segments();
    const string& source = this->layer_param_.video_data_param().source();
// Open the dataset list file
    LOG(INFO) << "Opening file: " << source;
    // open the list file
    std::ifstream infile(source.c_str());
    string filename;
    int label;
    int length;
    // Parse each line: frame directory, frame count, class label
    while (infile >> filename >> length >> label){
        // store (path, label) in lines_ and the frame count in lines_duration_
        lines_.push_back(std::make_pair(filename,label));
        lines_duration_.push_back(length);
    }
    // Shuffle the videos
    if (this->layer_param_.video_data_param().shuffle()){
        const unsigned int prefectch_rng_seed = caffe_rng_rand();
        prefetch_rng_1_.reset(new Caffe::RNG(prefectch_rng_seed));
        prefetch_rng_2_.reset(new Caffe::RNG(prefectch_rng_seed));
        ShuffleVideos();
    }
// The size of lines_ is the number of videos in the training (test) set
    LOG(INFO) << "A total of " << lines_.size() << " videos.";
    lines_id_ = 0;

    // Check the frame-name pattern; fall back to per-modality defaults
    if (this->layer_param_.video_data_param().name_pattern() == ""){
        if (this->layer_param_.video_data_param().modality() == VideoDataParameter_Modality_RGB){
            name_pattern_ = "image_%04d.jpg";
        }else if (this->layer_param_.video_data_param().modality() == VideoDataParameter_Modality_FLOW){
            name_pattern_ = "flow_%c_%04d.jpg";
        }
    }else{
        name_pattern_ = this->layer_param_.video_data_param().name_pattern();
    }
    // Do a trial read to determine the data shape
    Datum datum;
    bool is_color = !this->layer_param_.video_data_param().grayscale();
    const unsigned int frame_prefectch_rng_seed = caffe_rng_rand();
    frame_prefetch_rng_.reset(new Caffe::RNG(frame_prefectch_rng_seed));
    // Average segment length when the video is split into num_segments segments
    int average_duration = (int) lines_duration_[lines_id_]/num_segments;
    vector<int> offsets; // frame-index offsets of the sampled snippets
    for (int i = 0; i < num_segments; ++i){
        caffe::rng_t* frame_rng = static_cast<caffe::rng_t*>(frame_prefetch_rng_->generator());
        int offset = (*frame_rng)() % (average_duration - new_length + 1);
        offsets.push_back(offset+i*average_duration); // offset relative to the first frame
    }
    // The read routine differs per modality
    if (this->layer_param_.video_data_param().modality() == VideoDataParameter_Modality_FLOW)
    // trial read for optical flow
        CHECK(ReadSegmentFlowToDatum(lines_[lines_id_].first, lines_[lines_id_].second,
                                     offsets, new_height, new_width, new_length, &datum, name_pattern_.c_str()));
    else
    // trial read for RGB
        CHECK(ReadSegmentRGBToDatum(lines_[lines_id_].first, lines_[lines_id_].second,
                                    offsets, new_height, new_width, new_length, &datum, is_color, name_pattern_.c_str()));
    // Data prefetching:
    // prefetch_data_ is a Blob. In Caffe, an internal thread fills a prefetch
    // queue and the forward pass consumes from it, much like the input queues
    // in TensorFlow (there the queue size is user-configurable; in Caffe it
    // defaults to 3). In the code below, top_data is backed by prefetch_data_.
    const int crop_size = this->layer_param_.transform_param().crop_size();
    const int batch_size = this->layer_param_.video_data_param().batch_size();
    if (crop_size > 0){ // cropped input
        top[0]->Reshape(batch_size, datum.channels(), crop_size, crop_size);
        this->prefetch_data_.Reshape(batch_size, datum.channels(), crop_size, crop_size);
    } else {
        top[0]->Reshape(batch_size, datum.channels(), datum.height(), datum.width());
        this->prefetch_data_.Reshape(batch_size, datum.channels(), datum.height(), datum.width());
    }
    LOG(INFO) << "output data size: " << top[0]->num() << "," << top[0]->channels() << "," << top[0]->height() << "," << top[0]->width();
    // allocate the label blob
    top[1]->Reshape(batch_size, 1, 1, 1);
    this->prefetch_label_.Reshape(batch_size, 1, 1, 1);
    // allocate the blob that will hold transformed (preprocessed) data
    vector<int> top_shape = this->data_transformer_->InferBlobShape(datum);
    this->transformed_data_.Reshape(top_shape);
}
// Internal prefetch thread: reads one batch of data
template <typename Dtype>
void VideoDataLayer<Dtype>::InternalThreadEntry(){

    Datum datum;
    CHECK(this->prefetch_data_.count());
    // Get mutable pointers to the prefetch data and label blobs
    Dtype* top_data = this->prefetch_data_.mutable_cpu_data();
    Dtype* top_label = this->prefetch_label_.mutable_cpu_data();

    VideoDataParameter video_data_param = this->layer_param_.video_data_param();
    const int batch_size = video_data_param.batch_size();
    const int new_height = video_data_param.new_height();
    const int new_width = video_data_param.new_width();
    const int new_length = video_data_param.new_length();
    const int num_segments = video_data_param.num_segments();
    // number of videos in the training (test) set
    const int lines_size = lines_.size();

    bool is_color = !this->layer_param_.video_data_param().grayscale();
    for (int item_id = 0; item_id < batch_size; ++item_id){ // loop over the batch
        CHECK_GT(lines_size, lines_id_); // the current video index must stay within range
        // The following mirrors the trial-read code in DataLayerSetUp
        vector<int> offsets;
        int average_duration = (int) lines_duration_[lines_id_] / num_segments;
        for (int i = 0; i < num_segments; ++i){
            if (this->phase_==TRAIN){
                if (average_duration >= new_length){
                    caffe::rng_t* frame_rng = static_cast<caffe::rng_t*>(frame_prefetch_rng_->generator());
                    int offset = (*frame_rng)() % (average_duration - new_length + 1);
                    offsets.push_back(offset+i*average_duration);
                } else {
                    offsets.push_back(0);
                }
            } else {
                // At test time, sample deterministically from the center of each segment
                if (average_duration >= new_length)
                    offsets.push_back(int((average_duration-new_length+1)/2 + i*average_duration));
                else
                    offsets.push_back(0);
            }
        }
        if (this->layer_param_.video_data_param().modality() == VideoDataParameter_Modality_FLOW){
        // read the sampled flow frames into datum
            if(!ReadSegmentFlowToDatum(lines_[lines_id_].first, lines_[lines_id_].second,
                                       offsets, new_height, new_width, new_length, &datum, name_pattern_.c_str())) {
                continue;
            }
        } else{
            if(!ReadSegmentRGBToDatum(lines_[lines_id_].first, lines_[lines_id_].second,
                                      offsets, new_height, new_width, new_length, &datum, is_color, name_pattern_.c_str())) {
                continue;
            }
        }
        // Once one item has been read, preprocess it in place
        int offset1 = this->prefetch_data_.offset(item_id); // pointer offset for this item
        this->transformed_data_.set_cpu_data(top_data + offset1); // transform writes directly into the prefetch buffer
        this->data_transformer_->Transform(datum, &(this->transformed_data_)); // preprocessing
        top_label[item_id] = lines_[lines_id_].second; // store the label

        //next iteration
        lines_id_++; // advance to the next video
        if (lines_id_ >= lines_size) {
            DLOG(INFO) << "Restarting data prefetching from start.";
            lines_id_ = 0;
            if(this->layer_param_.video_data_param().shuffle()){
                ShuffleVideos();
            }
        }
    }
}

INSTANTIATE_CLASS(VideoDataLayer);
REGISTER_LAYER_CLASS(VideoData);
}
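The offset computation above is the heart of TSN's segment sampling. A small Python sketch of both branches (random offsets at training time, segment centers at test time), mirroring the C++ logic:

import random

def sample_offsets(num_frames, num_segments=3, new_length=5, train=True):
    """One frame offset per segment, as in VideoDataLayer::InternalThreadEntry."""
    average_duration = num_frames // num_segments
    offsets = []
    for i in range(num_segments):
        if average_duration >= new_length:
            if train:
                # random start inside the segment
                off = random.randint(0, average_duration - new_length)
            else:
                # deterministic start at the center of the segment
                off = (average_duration - new_length + 1) // 2
            offsets.append(off + i * average_duration)
        else:
            offsets.append(0)  # video too short: fall back to the first frame
    return offsets

print(sample_offsets(121, train=True))   # e.g. [17, 52, 94]
print(sample_offsets(121, train=False))  # [18, 58, 98]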

video_data_layer uses ReadSegmentFlowToDatum and ReadSegmentRGBToDatum to read optical-flow and RGB frames, respectively, and convert them into a Datum. Before going into them, let's see how Datum is described in caffe.proto:

message Datum {
// Shape of the data block: channels, height, width
  optional int32 channels = 1;
  optional int32 height = 2;
  optional int32 width = 3;
  // the actual image data, stored as raw bytes
  optional bytes data = 4;
  // the label, an int32
  optional int32 label = 5;
  // Optionally, the datum could also hold float data.
  repeated float float_data = 6;
  // If true data contains an encoded image that need to be decoded
  optional bool encoded = 7 [default = false];
}
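For reference, the same message can be filled from Python through the generated protobuf bindings. A minimal sketch, assuming pycaffe is on sys.path; the image here is just a placeholder:

import numpy as np
from caffe.proto import caffe_pb2

datum = caffe_pb2.Datum()
img = np.zeros((224, 224), dtype=np.uint8)  # placeholder for one grayscale flow image
datum.channels, datum.height, datum.width = 1, img.shape[0], img.shape[1]
datum.label = 0
datum.data = img.tobytes()  # raw bytes, matching the `optional bytes data = 4` field
print(datum.channels, len(datum.data))  # 1 50176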

As the definition shows, a Datum stores one example: its data (channels, height, width) and its label. The function below packs one example's images into a Datum (one example may contain several images, as indicated by channels):

// filename: directory holding the video's frames
// label: the video's class label
// offsets: sampled frame-index offsets, relative to the first frame
// height, width: target image size
// length: number of consecutive frames sampled from each offset
//         (for optical flow, one frame consists of an x image and a y image)
// datum: pointer to the output Datum
// name_pattern: frame-file naming pattern
bool ReadSegmentFlowToDatum(const string& filename, const int label,
    const vector<int> offsets, const int height, const int width, const int length, Datum* datum,
    const char* name_pattern ){
    // read the images with OpenCV
    cv::Mat cv_img_x, cv_img_y;
    string* datum_string;
    char tmp[30]; // buffer for the frame file name
    for (int i = 0; i < offsets.size(); ++i){ // outer loop: one iteration per segment
        int offset = offsets[i]; // this segment's frame offset
        for (int file_id = 1; file_id < length+1; ++file_id){ // read length consecutive frames
            sprintf(tmp, name_pattern, 'x', int(file_id+offset)); // x-flow file name
            string filename_x = filename + "/" + tmp;             // directory + file name
            // read as a grayscale image
            cv::Mat cv_img_origin_x = cv::imread(filename_x, CV_LOAD_IMAGE_GRAYSCALE);
            sprintf(tmp, name_pattern, 'y', int(file_id+offset)); // y-flow file name
            string filename_y = filename + "/" + tmp;
            cv::Mat cv_img_origin_y = cv::imread(filename_y, CV_LOAD_IMAGE_GRAYSCALE);
            if (!cv_img_origin_x.data || !cv_img_origin_y.data){
                LOG(ERROR) << "Could not load file " << filename_x << " or " << filename_y;
                return false;
            }
            // resize if a target size was given
            if (height > 0 && width > 0){
                cv::resize(cv_img_origin_x, cv_img_x, cv::Size(width, height));
                cv::resize(cv_img_origin_y, cv_img_y, cv::Size(width, height));
            }else{
                cv_img_x = cv_img_origin_x;
                cv_img_y = cv_img_origin_y;
            }
            // On the very first image, set the datum metadata and grab its mutable data string
            if (file_id==1 && i==0){
                int num_channels = 2;
                // channels = 2 (x and y) * length * number of segments
                datum->set_channels(num_channels*length*offsets.size());
                datum->set_height(cv_img_x.rows);
                datum->set_width(cv_img_x.cols);
                datum->set_label(label);
                datum->clear_data();
                datum->clear_float_data();
                datum_string = datum->mutable_data();
            }
            // Copy every pixel into datum_string; datum now holds this example's data
            for (int h = 0; h < cv_img_x.rows; ++h){
                for (int w = 0; w < cv_img_x.cols; ++w){
                    datum_string->push_back(static_cast<char>(cv_img_x.at<uchar>(h,w)));
                }
            }
            for (int h = 0; h < cv_img_y.rows; ++h){
                for (int w = 0; w < cv_img_y.cols; ++w){
                    datum_string->push_back(static_cast<char>(cv_img_y.at<uchar>(h,w)));
                }
            }
        }
    }
    return true;
}
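The loop order above fixes the channel layout of the packed datum: segments are concatenated, and within each segment each frame contributes its full x image followed by its full y image. A sketch that recovers one (x, y) pair from the packed bytes (the function name and indexing are mine, derived from that layout):

import numpy as np

def unpack_flow_frame(datum_bytes, seg, frame, length=5, h=224, w=224):
    # Channel order: seg0 f0x, f0y, f1x, f1y, ..., then seg1, ...
    plane = h * w
    base = (seg * length + frame) * 2 * plane
    x = np.frombuffer(datum_bytes[base:base + plane], np.uint8).reshape(h, w)
    y = np.frombuffer(datum_bytes[base + plane:base + 2 * plane], np.uint8).reshape(h, w)
    return x, y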

Network testing

eval_net.py

import argparse
import os
import sys
import math
import cv2
import numpy as np
import multiprocessing
from sklearn.metrics import confusion_matrix

sys.path.append('.')
from pyActionRecog import parse_directory
from pyActionRecog import parse_split_file

from pyActionRecog.utils.video_funcs import default_aggregation_func

parser = argparse.ArgumentParser()
# dataset: ucf101 or hmdb51
parser.add_argument('dataset', type=str, choices=['ucf101', 'hmdb51'])
# which of the three official train/test splits to use
parser.add_argument('split', type=int, choices=[1, 2, 3],
                    help='on which split to test the network')
# modality: RGB or optical flow
parser.add_argument('modality', type=str, choices=['rgb', 'flow'])
# root directory of the extracted frames; layout: frame_path/video_name/*.jpg (flow or RGB)
parser.add_argument('frame_path', type=str, help="root directory holding the frames")
# deploy prototxt of the test network
parser.add_argument('net_proto', type=str)
# caffemodel file holding the network weights
parser.add_argument('net_weights', type=str)
# naming prefixes of the frame files
parser.add_argument('--rgb_prefix', type=str, help="prefix of RGB frames", default='img_')
parser.add_argument('--flow_x_prefix', type=str, help="prefix of x direction flow images", default='flow_x_')
parser.add_argument('--flow_y_prefix', type=str, help="prefix of y direction flow images", default='flow_y_')
# At test time the video is sampled at num_frame_per_video evenly spaced positions;
# each sample is scored against every class, and the scores are fused into the
# final prediction
parser.add_argument('--num_frame_per_video', type=int, default=25,
                    help="number of frames (or flow stacks) to sample per video")
# save every test video's class scores, so RGB and flow results can be fused later
parser.add_argument('--save_scores', type=str, default=None, help='the filename to save the scores in')
# number of worker processes (multiprocessing is supported)
parser.add_argument('--num_worker', type=int, default=1)
# root directory of the Caffe build
parser.add_argument("--caffe_path", type=str, default='./lib/caffe-action/', help='path to the caffe toolbox')
# list of GPUs to use
parser.add_argument("--gpus", type=int, nargs='+', default=None, help='specify list of gpu to use')
args = parser.parse_args()

print args

sys.path.append(os.path.join(args.caffe_path, 'python'))
from pyActionRecog.action_caffe import CaffeNet

# build necessary information
print args.dataset
# parse_split_file and parse_directory collect the dataset metadata:
# each video's frame count, class label, frame path, dataset size, and so on
split_tp = parse_split_file(args.dataset)
f_info = parse_directory(args.frame_path,
                         args.rgb_prefix, args.flow_x_prefix, args.flow_y_prefix)

gpu_list = args.gpus

# eval_video_list: the (video name, label) pairs of the chosen test split
eval_video_list = split_tp[args.split - 1][1]

score_name = 'fc-action'


def build_net():
    global net
    my_id = multiprocessing.current_process()._identity[0] \
        if args.num_worker > 1 else 1
    if gpu_list is None:
        net = CaffeNet(args.net_proto, args.net_weights, my_id-1)
    else:
        net = CaffeNet(args.net_proto, args.net_weights, gpu_list[my_id - 1])

# Core of the evaluation: scores one video, given its (name, label) pair
def eval_video(video):
    global net

    label = video[1]
    vid = video[0]
    # look up the video's frame directory in f_info
    video_frame_path = f_info[0][vid]
    # RGB needs one image per frame; flow has x and y images, so two per frame
    if args.modality == 'rgb':
        cnt_indexer = 1
    elif args.modality == 'flow':
        cnt_indexer = 2
    else:
        raise ValueError(args.modality)
    frame_cnt = f_info[cnt_indexer][vid]

    # RGB samples one frame at a time; flow samples a stack of 5 consecutive frames
    stack_depth = 0
    if args.modality == 'rgb':
        stack_depth = 1
    elif args.modality == 'flow':
        stack_depth = 5
    # divide the video into equal steps and sample once per step
    step = (frame_cnt - stack_depth) / (args.num_frame_per_video-1)
    if step > 0:
        frame_ticks = range(1, min((2 + step * (args.num_frame_per_video-1)), frame_cnt+1), step)
    else:
        frame_ticks = [1] * args.num_frame_per_video

    assert(len(frame_ticks) == args.num_frame_per_video)

    frame_scores = []
    # loop over the sampled positions
    for tick in frame_ticks:
        # RGB and flow are read differently
        if args.modality == 'rgb':
            name = '{}{:05d}.jpg'.format(args.rgb_prefix, tick)
            frame = cv2.imread(os.path.join(video_frame_path, name), cv2.IMREAD_COLOR)
            # net.predict_single_frame scores the frame with multi-crop oversampling;
            # its output has shape (num_crops, 101)

            scores = net.predict_single_frame([frame,], score_name, frame_size=(340, 256))
            frame_scores.append(scores)
        # the flow branch proceeds the same way as RGB
        if args.modality == 'flow':
            frame_idx = [min(frame_cnt, tick+offset) for offset in xrange(stack_depth)]
            flow_stack = []
            for idx in frame_idx:
                x_name = '{}{:05d}.jpg'.format(args.flow_x_prefix, idx)
                y_name = '{}{:05d}.jpg'.format(args.flow_y_prefix, idx)
                flow_stack.append(cv2.imread(os.path.join(video_frame_path, x_name), cv2.IMREAD_GRAYSCALE))
                flow_stack.append(cv2.imread(os.path.join(video_frame_path, y_name), cv2.IMREAD_GRAYSCALE))
            scores = net.predict_single_flow_stack(flow_stack, score_name, frame_size=(340, 256))
            frame_scores.append(scores)

    print 'video {} done'.format(vid)
    sys.stdout.flush()
    return np.array(frame_scores), label
# multi-process evaluation
if args.num_worker > 1:
    pool = multiprocessing.Pool(args.num_worker, initializer=build_net)
    video_scores = pool.map(eval_video, eval_video_list)  # map eval_video over the video list
else:
    build_net()
    # after the map, each entry of video_scores holds scores of shape (frame_ticks, num_crops, 101) plus its label
    video_scores = map(eval_video, eval_video_list)
# fuse each video's (frame_ticks, num_crops, 101) scores (mean or max) into a single prediction
video_pred = [np.argmax(default_aggregation_func(x[0])) for x in video_scores]
video_labels = [x[1] for x in video_scores]

cf = confusion_matrix(video_labels, video_pred).astype(float)

cls_cnt = cf.sum(axis=1)
cls_hit = np.diag(cf)

cls_acc = cls_hit/cls_cnt

print cls_acc

print 'Accuracy {:.02f}%'.format(np.mean(cls_acc)*100)

if args.save_scores is not None:
    np.savez(args.save_scores, scores=video_scores, labels=video_labels)
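With --save_scores used for both streams, the standard TSN two-stream fusion is a weighted sum of the saved score arrays. A sketch (the file names are placeholders, and the 1 : 1.5 RGB-to-flow weighting is the ratio reported in the TSN paper, not something this script enforces):

import numpy as np

def load_video_scores(path):
    # each saved entry is (frame_scores, label); average over sample positions and crops
    data = np.load(path, allow_pickle=True)
    return np.array([s.mean(axis=(0, 1)) for s, _ in data['scores']]), data['labels']

rgb_scores, labels = load_video_scores('rgb_scores.npz')    # placeholder file names
flow_scores, _ = load_video_scores('flow_scores.npz')

fused = 1.0 * rgb_scores + 1.5 * flow_scores
print('fused accuracy: {:.2f}%'.format(100 * (fused.argmax(axis=1) == labels).mean()))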

eval_net.py relies on two methods, predict_single_frame and predict_single_flow_stack, to perform the multi-crop prediction. Taking predict_single_flow_stack as the example: it lives in pyActionRecog/action_caffe.py, where the author defines a CaffeNet class; predict_single_flow_stack is one of its methods:

def predict_single_flow_stack(self, frame, score_name, over_sample=True, frame_size=None):
    if frame_size is not None:
        frame = fast_list2arr([cv2.resize(x, frame_size) for x in frame])
    else:
        frame = fast_list2arr(frame)
    # multi-crop oversampling (corners + center, plus mirrored copies)
    if over_sample:
        os_frame = flow_stack_oversample(frame, (self._sample_shape[2], self._sample_shape[3]))
    else:
        os_frame = fast_list2arr([frame])

    data = os_frame - np.float32(128.0)
    # resize the input blob to match the oversampled batch
    self._net.blobs['data'].reshape(*data.shape)
    self._net.reshape()
    # forward pass to get the per-class scores
    out = self._net.forward(blobs=[score_name,], data=data)
    # output shape: (num_crops, 101)
    return out[score_name].copy()
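flow_stack_oversample itself is defined in pyActionRecog's utilities; below is a simplified sketch of the idea: 4 corner crops plus a center crop, each also mirrored, 10 crops total. One subtlety the sketch includes is that mirroring a flow stack must also invert the x-flow channels, since flipping the image reverses horizontal motion. Treat the exact indexing (x at even channel indices) as an assumption about the layout, not a verified detail of the library:

import numpy as np

def oversample_flow_stack(stack, crop_h, crop_w):
    # stack: (C, H, W) with x-flow at even channel indices, y-flow at odd ones (assumed)
    c, h, w = stack.shape
    ys = [0, 0, h - crop_h, h - crop_h, (h - crop_h) // 2]
    xs = [0, w - crop_w, 0, w - crop_w, (w - crop_w) // 2]
    crops = []
    for y, x in zip(ys, xs):
        crop = stack[:, y:y + crop_h, x:x + crop_w]
        crops.append(crop)
        flipped = crop[:, :, ::-1].copy()
        flipped[0::2] = 255 - flipped[0::2]  # invert x-flow on mirrored crops
        crops.append(flipped)
    return np.stack(crops)  # (10, C, crop_h, crop_w)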