NNPACK Explained
1: Introduction to NNPACK
NNPACK, developed by Facebook, is an acceleration package for neural network computation that improves the performance of convolution layers on multi-core CPUs.
NNPACK's fast convolution algorithms are based on the Fourier transform and the Winograd transform.
2: Forward-Pass Performance
The table below compares an Intel Core i7 6700K running the BVLC Caffe master branch against NNPACK:
| Library | Caffe | NNPACK | NNPACK | NNPACK |
| --- | --- | --- | --- | --- |
| Algorithm | im2col + sgemm | FFT-8x8 | FFT-16x16 | Winograd F(6x6, 3x3) |
| AlexNet:conv2 | 315 ms | 129 ms | 86 ms | N/A |
| AlexNet:conv3 | 182 ms | 87 ms | 44 ms | 70 ms |
| AlexNet:conv4 | 264 ms | 109 ms | 56 ms | 89 ms |
| AlexNet:conv5 | 177 ms | 77 ms | 40 ms | 64 ms |
| VGG-A:conv1 | 255 ms | 303 ms | 260 ms | 404 ms |
| VGG-A:conv2 | 902 ms | 369 ms | 267 ms | 372 ms |
| VGG-A:conv3.1 | 566 ms | 308 ms | 185 ms | 279 ms |
| VGG-A:conv3.2 | 1091 ms | 517 ms | 309 ms | 463 ms |
| VGG-A:conv4.1 | 432 ms | 228 ms | 149 ms | 188 ms |
| VGG-A:conv4.2 | 842 ms | 402 ms | 264 ms | 329 ms |
| VGG-A:conv5 | 292 ms | 141 ms | 83 ms | 114 ms |
| OverFeat:conv2 | 424 ms | 158 ms | 73 ms | N/A |
| OverFeat:conv3 | 250 ms | 69 ms | 74 ms | 54 ms |
| OverFeat:conv4 | 927 ms | 256 ms | 272 ms | 173 ms |
| OverFeat:conv5 | 1832 ms | 466 ms | 524 ms | 315 ms |
3: Layers Supported by NNPACK
- Convolutional layer
- Fully-connected layer
- Max pooling layer
- ReLU layer
- Softmax layer
4: Building NNPACK (Linux)
1. Build and install PeachPy
- git clone https:
- cd PeachPy
- [sudo] pip install --upgrade -r requirements.txt
- python setup.py generate
- [sudo] pip install --upgrade .
2. Install ninja and the ninja-syntax Python module
- sudo apt-get install ninja-build || brew install ninja
- [sudo] pip install ninja-syntax
3. Download and build NNPACK
- git clone --recursive https:
- cd NNPACK
- python ./configure.py
- ninja
Note:
If the NNPACK build fails with missing headers or similar errors, you usually need to fetch the third-party dependencies. The NNPACK source tree contains a third-party directory; download each dependency into its corresponding subdirectory and build each one.
5: Testing
After NNPACK is built, the NNPACK-master/bin directory contains executables for testing convolution, fully-connected layers, and so on.
For example, testing convolution with 16 input channels, 16 output channels, a 180*180 input image, a 3*3 kernel, and 100 iterations.
Execution results:
6: Implementing Convolution with NNPACK
- input channels: 1
- output channels: 1
- input size: 4*4
- kernel size: 3*3

1. Code (conv_nnpack.c):
```c
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/time.h>
#include <nnpack.h>

int main(int argc, char** argv)
{
    enum nnp_status init_status = nnp_initialize();
    if (init_status != nnp_status_success) {
        fprintf(stderr, "NNPACK initialization failed: error code %d\n", init_status);
        exit(EXIT_FAILURE);
    }

    enum nnp_convolution_algorithm algorithm = nnp_convolution_algorithm_auto;
    const size_t batch_size = 1;
    const size_t input_channels = 1;
    const size_t output_channels = 1;
    const struct nnp_padding input_padding = { 0, 0, 0, 0 };
    const struct nnp_size input_size = { 4, 4 };
    const struct nnp_size kernel_size = { 3, 3 };
    /* nnp_convolution_output computes a stride-1 convolution. */
    const struct nnp_size output_size = {
        .width = input_padding.left + input_size.width + input_padding.right - kernel_size.width + 1,
        .height = input_padding.top + input_size.height + input_padding.bottom - kernel_size.height + 1
    };
    int iter = 1;

    float* input = (float*)malloc(batch_size * input_channels * input_size.height * input_size.width * sizeof(float));
    float* kernel = (float*)malloc(input_channels * output_channels * kernel_size.height * kernel_size.width * sizeof(float));
    float* output = (float*)malloc(batch_size * output_channels * output_size.height * output_size.width * sizeof(float));
    float* bias = (float*)malloc(output_channels * sizeof(float));

    /* A NULL threadpool makes NNPACK run single-threaded; to use all cores,
     * create one with pthreadpool_create(0). */
    pthreadpool_t threadpool = NULL;

    struct nnp_profile computation_profile;

    /* Fill the input with a ramp: 0, 1, 2, ... */
    for (size_t c = 0; c < input_channels; c++)
        for (size_t i = 0; i < input_size.height; i++)
            for (size_t j = 0; j < input_size.width; j++)
                input[c * input_size.height * input_size.width + i * input_size.width + j] = (float)(i * input_size.width + j);

    /* All-ones kernels. */
    for (size_t o = 0; o < output_channels; o++)
        for (size_t j = 0; j < input_channels * kernel_size.height * kernel_size.width; j++)
            kernel[o * input_channels * kernel_size.height * kernel_size.width + j] = 1.0f;

    /* Bias of 1.0 per output channel. */
    for (size_t o = 0; o < output_channels; o++)
        bias[o] = 1.0f;

    struct timeval conv_start, conv_end;
    gettimeofday(&conv_start, NULL);
    for (int it = 0; it < iter; it++) {
        nnp_convolution_output(algorithm,
                               batch_size,
                               input_channels,
                               output_channels,
                               input_size,
                               input_padding,
                               kernel_size,
                               input,
                               kernel,
                               bias,
                               output,
                               threadpool,
                               &computation_profile);
    }
    gettimeofday(&conv_end, NULL);

    for (size_t o = 0; o < output_channels; o++) {
        for (size_t j = 0; j < output_size.height * output_size.width; j++)
            printf("%f\t", output[o * output_size.height * output_size.width + j]);
        printf("\n");
    }

    float conv_time_use = 1000.0f * (float)(conv_end.tv_sec - conv_start.tv_sec)
                        + (float)(conv_end.tv_usec - conv_start.tv_usec) / 1000.0f;
    printf("conv Time use = %f(ms)\n", conv_time_use);
    printf("conv mean Time use = %f(ms) / iter\n", conv_time_use / iter);

    free(input); free(kernel); free(output); free(bias);
    nnp_deinitialize();
    return 0;
}
```
2. Compiling
Link against -lnnpack and -lpthreadpool (plus -lpthread), e.g. `gcc conv_nnpack.c -o conv_nnpack -lnnpack -lpthreadpool -lpthread`; depending on where NNPACK was installed, -I/-L flags for its headers and libraries may also be needed. pthreadpool is the third-party library NNPACK uses to create worker threads with pthread, by default one per CPU core; in the code above the threadpool is left NULL, so a single thread is used.
3. Output:
7: NNPACK vs. im2col+sgemm Convolution Performance
- im2col+sgemm uses OpenBLAS
- input channels: 16
- output channels: 16
- input size: 360*360
- kernel size: 2*2, 3*3, 5*5, 10*10
Results (figure: nnpack vs im2col_sgemm):
| Kernel size | 2 | 3 | 5 | 10 |
| --- | --- | --- | --- | --- |
| nnpack | 6.69 ms | 7.38 ms | 9.71 ms | 26.44 ms |
| im2col_sgemm | 37.83 ms | 86.95 ms | 236.91 ms | 929.66 ms |
Table: nnpack vs im2col_sgemm