详细用法 :最大化TensorFlow* CPU性能
原文:Maximize TensorFlow* Performance on CPU: Considerations and Recommendations for Inference Workloads
export TF_ENABLE_ONEDNN_OPTS=1
intra_op_parallelism = number of physical cores per socket
#每个插槽的物理内核数
inter_op_parallelism = number of sockets
Get the number of physical cores per socket and the number of sockets on your platform:
#!/bin/bash
# Report the number of CPU sockets and the number of physical cores per
# socket, for choosing inter_op_parallelism / intra_op_parallelism.
#
# The topology is read from /proc/cpuinfo rather than derived from
# `nproc / 2`: halving the logical CPU count is only right when every core
# exposes exactly two hardware threads, and `nproc` reports only the CPUs
# the current process is allowed to run on (so it shrinks under
# taskset/numactl/cpusets).

#######################################
# Summarise the package/core topology described by a cpuinfo-style file.
# Arguments: $1 - path to a file in /proc/cpuinfo format
# Outputs:   "<sockets> <physical cores>" on stdout, where sockets is the
#            number of distinct "physical id" values and physical cores is
#            the number of distinct (physical id, core id) pairs.
#            Either number is 0 when the corresponding field is absent.
#######################################
cpu_topology() {
  awk -F: '
    function trim(s) { gsub(/^[ \t]+|[ \t]+$/, "", s); return s }
    # Record the stanza collected so far, then reset for the next one.
    function commit() {
      if (pkg != "" && !(pkg in pkgs)) { pkgs[pkg] = 1; nsock++ }
      if (core != "" && !((pkg, core) in cores)) { cores[pkg, core] = 1; ncore++ }
      pkg = ""; core = ""
    }
    /^physical id/ { pkg = trim($2) }
    /^core id/     { core = trim($2) }
    /^[ \t]*$/     { commit() }   # a blank line ends one logical CPU stanza
    END            { commit(); print nsock + 0, ncore + 0 }
  ' "$1"
}

# Logical CPU count; only used as a fallback when cpuinfo has no topology.
total_cpu_cores=$(nproc)

topology=$(cpu_topology /proc/cpuinfo 2>/dev/null)
number_sockets=${topology%% *}
physical_cores=${topology##* }
number_sockets=${number_sockets:-0}
physical_cores=${physical_cores:-0}

# Platforms whose cpuinfo lacks "physical id" are treated as one socket.
if (( number_sockets == 0 )); then
  number_sockets=1
fi
# Without "core id" lines the physical core count is unknown; fall back to
# the logical CPU count instead of guessing a threads-per-core ratio.
if (( physical_cores == 0 )); then
  physical_cores=$total_cpu_cores
fi

number_cpu_cores=$(( physical_cores / number_sockets ))
echo "number of CPU cores per socket: $number_cpu_cores"
echo "number of socket: $number_sockets"
data_format = NHWC
export TF_ENABLE_MKL_NATIVE_FORMAT=1 (or 0)
numactl --cpunodebind=0 --membind=0 python
numactl --cpunodebind=N --membind=N python
numactl --cpunodebind=0 --membind=0 python & numactl --cpunodebind=1 --membind=1 python
export OMP_NUM_THREADS=<number of physical cores>
export KMP_AFFINITY=granularity=fine,compact,1,0
export KMP_BLOCKTIME=0 (or 1)