本文首发于个人博客https://kezunlin.me/post/7a6ba82e/,欢迎阅读!
speed up opencv image processing with openmp
Series
- Part 1: compile opencv on ubuntu 16.04
- Part 2: compile opencv with CUDA support on windows 10
- Part 3: opencv mat for loop
- Part 4: speed up opencv image processing with openmp
Guide
config
- linux/windows: cmake with
CXX_FLAGS=-fopenmp
- windows VS: Visual Studio also supports OpenMP; enable it under
C/C++ | Language | /openmp
usage
#include <omp.h>
#pragma omp parallel for
for loop ...
code
#include <cstdio>    // printf
#include <iostream>
#include <omp.h>     // omp_set_num_threads, omp_get_thread_num

// Minimal OpenMP demo: runs an 8-iteration loop in parallel and has every
// iteration print its index together with the id of the thread that ran it.
int main()
{
    // Request 4 threads for the parallel region below.
    omp_set_num_threads(4);

    // The iterations do not depend on each other, so the loop can be split
    // across the threads. Lines are printed in whatever order the threads
    // reach them, not in ascending order of i.
#pragma omp parallel for
    for (int i = 0; i < 8; i++)
    {
        printf("i = %d, I am Thread %d\n", i, omp_get_thread_num());
    }

    printf("\n");
    return 0;
}
/*
i = 0, I am Thread 0
i = 1, I am Thread 0
i = 4, I am Thread 2
i = 5, I am Thread 2
i = 6, I am Thread 3
i = 7, I am Thread 3
i = 2, I am Thread 1
i = 3, I am Thread 1
*/
CMakeLists.txt
use CXX_FLAGS=-fopenmp
in CMakeLists.txt
# Build script for the OpenMP "hello" demo.
#
# The OpenMP::OpenMP_<lang> imported targets used below exist since CMake 3.9,
# so that is the minimum version. The upper bound opts in to newer policies
# without breaking older CMake releases.
cmake_minimum_required(VERSION 3.9...3.29)
project(hello)

# REQUIRED stops configuration with an error when OpenMP is not available,
# so no if(OpenMP_FOUND) guard is needed afterwards.
find_package(OpenMP REQUIRED)

# Diagnostics: show what FindOpenMP detected for the enabled languages.
message(STATUS "OPENMP FOUND")
message(STATUS "[main] OpenMP_C_FLAGS=${OpenMP_C_FLAGS}")     # e.g. -fopenmp
message(STATUS "[main] OpenMP_CXX_FLAGS=${OpenMP_CXX_FLAGS}") # e.g. -fopenmp
# FindOpenMP reports libraries per language; there are no unprefixed
# OpenMP_INCLUDE_DIRS / OpenMP_LIBRARIES / OpenMP_EXE_LINKER_FLAGS variables.
message(STATUS "[main] OpenMP_CXX_LIB_NAMES=${OpenMP_CXX_LIB_NAMES}")
message(STATUS "[main] OpenMP_CXX_LIBRARIES=${OpenMP_CXX_LIBRARIES}")

add_executable(hello hello.cpp)

# Linking the imported target applies the OpenMP compile flags and link
# libraries to this target only, instead of editing the global
# CMAKE_C_FLAGS / CMAKE_CXX_FLAGS / CMAKE_EXE_LINKER_FLAGS strings.
target_link_libraries(hello PRIVATE OpenMP::OpenMP_CXX)
options
or use g++ hello.cpp -fopenmp
to compile
view demo
list dynamic dependencies (ldd)
ldd hello
linux-vdso.so.1 => (0x00007ffd71365000)
libstdc++.so.6 => /usr/lib/x86_64-linux-gnu/libstdc++.so.6 (0x00007f8ea7f00000)
libgomp.so.1 => /usr/lib/x86_64-linux-gnu/libgomp.so.1 (0x00007f8ea7cde000)
libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007f8ea7914000)
libm.so.6 => /lib/x86_64-linux-gnu/libm.so.6 (0x00007f8ea760b000)
/lib64/ld-linux-x86-64.so.2 (0x00007f8ea8282000)
libgcc_s.so.1 => /lib/x86_64-linux-gnu/libgcc_s.so.1 (0x00007f8ea73f5000)
libdl.so.2 => /lib/x86_64-linux-gnu/libdl.so.2 (0x00007f8ea71f1000)
libpthread.so.0 => /lib/x86_64-linux-gnu/libpthread.so.0 (0x00007f8ea6fd4000)
libgomp.so.1 => /usr/lib/x86_64-linux-gnu/libgomp.so.1
list names (nm)
nm hello
0000000000602080 B __bss_start
0000000000602190 b completed.7594
U __cxa_atexit@@GLIBC_2.2.5
0000000000602070 D __data_start
0000000000602070 W data_start
0000000000400b00 t deregister_tm_clones
0000000000400b80 t __do_global_dtors_aux
0000000000601df8 t __do_global_dtors_aux_fini_array_entry
0000000000602078 d __dso_handle
0000000000601e08 d _DYNAMIC
0000000000602080 D _edata
0000000000602198 B _end
0000000000400d44 T _fini
0000000000400ba0 t frame_dummy
0000000000601de8 t __frame_dummy_init_array_entry
0000000000400f18 r __FRAME_END__
0000000000602000 d _GLOBAL_OFFSET_TABLE_
0000000000400c28 t _GLOBAL__sub_I_main
w __gmon_start__
0000000000400d54 r __GNU_EH_FRAME_HDR
U GOMP_parallel@@GOMP_4.0
U __gxx_personality_v0@@CXXABI_1.3
00000000004009e0 T _init
0000000000601df8 t __init_array_end
0000000000601de8 t __init_array_start
0000000000400d50 R _IO_stdin_used
w _ITM_deregisterTMCloneTable
w _ITM_registerTMCloneTable
0000000000601e00 d __JCR_END__
0000000000601e00 d __JCR_LIST__
w _Jv_RegisterClasses
0000000000400d40 T __libc_csu_fini
0000000000400cd0 T __libc_csu_init
U __libc_start_main@@GLIBC_2.2.5
0000000000400bc6 T main
0000000000400c3d t main._omp_fn.0
U omp_get_num_threads@@OMP_1.0
U omp_get_thread_num@@OMP_1.0
0000000000400b40 t register_tm_clones
0000000000400ad0 T _start
0000000000602080 d __TMC_END__
0000000000400bea t _Z41__static_initialization_and_destruction_0ii
U _ZNSolsEPFRSoS_E@@GLIBCXX_3.4
U _ZNSt8ios_base4InitC1Ev@@GLIBCXX_3.4
U _ZNSt8ios_base4InitD1Ev@@GLIBCXX_3.4
0000000000602080 B _ZSt4cout@@GLIBCXX_3.4
U _ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_@@GLIBCXX_3.4
0000000000602191 b _ZStL8__ioinit
U _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_c@@GLIBCXX_3.4
omp_get_num_threads
,omp_get_thread_num
OpenMP Introduction
OpenMP的指令格式
#pragma omp directive [clause[clause]…]
#pragma omp parallel private(i, j)
parallel
is directive,private
is clause
directive
- parallel,用在一个代码段之前,表示这段代码将被多个线程并行执行
- for,用于for循环之前,将循环分配到多个线程中并行执行,必须保证每次循环之间无相关性。
- parallel for, parallel 和 for语句的结合,也是用在一个for循环之前,表示for循环的代码将被多个线程并行执行。
- sections,用在可能会被并行执行的代码段之前
- parallel sections,parallel和sections两个语句的结合
- critical,用在一段代码临界区之前
- single,用在一段只被单个线程执行的代码段之前,表示后面的代码段将被单线程执行。
- flush,用于保证各线程看到的共享变量内存视图一致
- barrier,用于并行区内代码的线程同步,所有线程执行到barrier时要停止,直到所有线程都执行到barrier时才继续往下执行。
- atomic,用于指定一块内存区域被原子地更新
- master,用于指定一段代码块由主线程执行
- ordered, 用于指定并行区域的循环按顺序执行
- threadprivate, 用于指定一个变量是线程私有的。
parallel for
OpenMP 对可以多线程化的循环有如下五个要求:
- 循环变量(就是i)必须是整型:OpenMP 2.x(如 MSVC 的 /openmp)只接受有符号整型,OpenMP 3.0 起还允许无符号整型、指针以及 C++ 随机访问迭代器。
- 循环的比较条件必须是< <= > >=中的一种
- 循环的增量部分必须是增减一个不变的值(即每次循环是不变的)。
- 如果比较符号是< <=,那每次循环i应该增加,反之应该减小
- 循环必须是没有奇奇怪怪的东西,不能从内部循环跳到外部循环,goto和break只能在循环内部跳转,异常必须在循环内部被捕获。
如果你的循环不符合这些条件,那就只好改写了.
avoid race condition