TNN:由腾讯优图实验室打造,移动端高性能、轻量级推理框架,同时拥有跨平台、高性能、模型压缩、代码裁剪等众多突出优势。TNN框架在原有Rapidnet、ncnn框架的基础上进一步加强了移动端设备的支持以及性能优化,同时也借鉴了业界主流开源框架高性能和良好拓展性的优点。目前TNN已经在手Q、微视、P图等应用中落地,欢迎大家参与协同共建,促进TNN推理框架进一步完善。
本系列文章为对腾讯TNN的深度源码级别解读,希望通过对一个推理框架的完整描述,来增强读者对神经网络从设计、实现到优化的方方面面的理解。
CPU省电模式。
在多CPU环境下可以通过设置参数powersave来控制耗电情况。目前提供了3种模式。主要是通过对CPU列表按主频进行排序,选取一些特定频率的CPU工作,并设置CPU亲和性来减少调度开销。
0:表示正常模式。
1:仅使用低频(小核)CPU模式。
2:使用高频CPU模式。
/**
 * Selects a subset of CPUs by maximum frequency and applies it as the CPU affinity mask.
 *
 * On Android the CPU ids are sorted once (descending by max frequency) and cached in
 * function-local statics together with the offset of the first "little" core.
 *
 * @param powersave 0: use every CPU;
 *                  1: use only the CPUs at or after the little-cluster offset (lower frequency);
 *                  2: use only the CPUs before the little-cluster offset (higher frequency).
 *                  When no little cluster was detected (offset 0), any non-zero value is
 *                  replaced by 0 after printing a message to stderr.
 * @return TNN_OK on success; TNNERR_SET_CPU_AFFINITY when the mode is unknown, when
 *         SetSchedAffinity fails, or when built for a platform other than Android.
 */
Status CpuUtils::SetCpuPowersave(int powersave) {
#ifdef __ANDROID__
    static std::vector<int> sorted_cpuids;
    static int little_cluster_offset = 0;
    static int cpucount              = GetCpuCount();

    if (sorted_cpuids.empty()) {
        // 0 ~ g_cpucount
        sorted_cpuids.resize(cpucount);
        for (int i = 0; i < cpucount; i++) {
            sorted_cpuids[i] = i;
        }

        // descent sort by max frequency
        SortCpuidByMaxFrequency(sorted_cpuids, &little_cluster_offset);
    }

    // An offset of 0 means the sort found no distinct little cluster, so there is nothing to choose from.
    if (little_cluster_offset == 0 && powersave != 0) {
        powersave = 0;
        fprintf(stderr, "SMP cpu powersave not supported");
    }

    // prepare affinity cpuid
    std::vector<int> cpuids;
    if (powersave == 0) {
        cpuids = sorted_cpuids;
    } else if (powersave == 1) {
        // low-frequency cores only
        cpuids = std::vector<int>(sorted_cpuids.begin() + little_cluster_offset, sorted_cpuids.end());
    } else if (powersave == 2) {
        // high-frequency cores only
        cpuids = std::vector<int>(sorted_cpuids.begin(), sorted_cpuids.begin() + little_cluster_offset);
    } else {
        fprintf(stderr, "powersave %d not supported", powersave);
        return TNNERR_SET_CPU_AFFINITY;
    }

#ifdef _OPENMP
    // set affinity for each thread: one OpenMP loop iteration per selected CPU,
    // each iteration applying the same mask from whichever thread runs it
    int num_threads = static_cast<int>(cpuids.size());
    omp_set_num_threads(num_threads);
    std::vector<int> ssarets(num_threads, 0);
#pragma omp parallel for
    for (int i = 0; i < num_threads; i++) {
        ssarets[i] = SetSchedAffinity(cpuids);
    }
    for (int i = 0; i < num_threads; i++) {
        if (ssarets[i] != 0) {
            return TNNERR_SET_CPU_AFFINITY;
        }
    }
#else
    int ssaret = SetSchedAffinity(cpuids);
    if (ssaret != 0) {
        return TNNERR_SET_CPU_AFFINITY;
    }
#endif

    return TNN_OK;
#else
    // TODO
    (void)powersave;  // Avoid unused parameter warning.
    return TNNERR_SET_CPU_AFFINITY;
#endif
}
- 获得CPU数
这里的CPU数并不是指物理CPU数,而是逻辑CPU个数,一般的双路CPU(2个物理CPU)逻辑CPU可以是20、40等,不同服务器配置会不同。获得CPU数的原理较为简单,只需要读取/proc/cpuinfo中以processor开头的行并统计总行数就可以实现。读者同样可以在shell环境下执行
grep processor /proc/cpuinfo获得结果。
/**
 * Counts the logical CPUs listed in /proc/cpuinfo.
 *
 * Every line of the file that begins with "processor" is counted as one CPU.
 *
 * @return the number of such lines, or 1 when the file cannot be opened or no such line is found.
 */
static int GetCpuCount() {
    // get cpu count from /proc/cpuinfo
    FILE* fp = fopen("/proc/cpuinfo", "rb");
    if (!fp) {
        return 1;
    }

    int count = 0;
    char line[1024];
    // fgets hands back lines longer than the buffer in several chunks; only a chunk that
    // starts a real line may be matched, otherwise a continuation could be miscounted.
    bool at_line_start = true;
    while (fgets(line, sizeof(line), fp)) {
        if (at_line_start && strncmp(line, "processor", 9) == 0) {
            count++;
        }
        const size_t len = strlen(line);
        at_line_start    = (len > 0 && line[len - 1] == '\n');
    }

    fclose(fp);

    // never report fewer than one CPU
    if (count < 1) {
        count = 1;
    }

    return count;
}
- 设置CPU亲和性
SMP(Symmetric Multi-Processing对称多处理)架构下,通过设置CPU affinity (一种调度属性scheduler property),实现进程和CPU绑定,调度器会试图保持进程在相同的CPU上运行,减少进程在处理器之间频繁迁移,降低负载和调度开销。
/**
 * Applies a CPU affinity mask built from the given CPU ids via the sched_setaffinity syscall.
 *
 * The target id is obtained from gettid (getpid when PI3 is defined). On platforms other
 * than Android/Linux the function does nothing and reports success.
 *
 * @param cpuids CPU ids to allow; each must lie in [0, TNN_CPU_SETSIZE).
 * @return 0 on success (or on unsupported platforms); -1 when an id is out of range or
 *         the syscall fails.
 */
static int SetSchedAffinity(const std::vector<int>& cpuids) {
#if defined(__ANDROID__) || defined(__linux__)
    // cpu_set_t definition
    // ref
    // http://stackoverflow.com/questions/16319725/android-set-thread-affinity
#define TNN_CPU_SETSIZE 1024
#define TNN_NCPUBITS (8 * sizeof(unsigned long))
    typedef struct {
        unsigned long __bits[TNN_CPU_SETSIZE / TNN_NCPUBITS];
    } cpu_set_t;

#define TNN_CPU_SET(cpu, cpusetp) ((cpusetp)->__bits[(cpu) / TNN_NCPUBITS] |= (1UL << ((cpu) % TNN_NCPUBITS)))

#define TNN_CPU_ZERO(cpusetp) memset((cpusetp), 0, sizeof(cpu_set_t))

    // set affinity for thread
#ifdef __GLIBC__
    pid_t pid = syscall(SYS_gettid);
#else
#ifdef PI3
    pid_t pid = getpid();
#else
    pid_t pid = gettid();
#endif
#endif
    cpu_set_t mask;
    TNN_CPU_ZERO(&mask);
    for (int i = 0; i < (int)cpuids.size(); i++) {
        const int cpuid = cpuids[i];
        // An id outside the mask would index past __bits (a negative id wraps to a huge
        // unsigned index), so reject it instead of corrupting the stack.
        if (cpuid < 0 || cpuid >= TNN_CPU_SETSIZE) {
            fprintf(stderr, "cpu id %d out of range", cpuid);
            return -1;
        }
        TNN_CPU_SET(cpuid, &mask);
    }

    int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(mask), &mask);
    if (syscallret) {
        fprintf(stderr, "syscall error %d", syscallret);
        return -1;
    }
#else
    (void)cpuids;  // Avoid unused parameter warning.
#endif

    return 0;
}
总的CPU亲和性封装接口,目前支持Android和Linux上进行亲和性设置。返回设置成功或者失败状态。
/**
 * Public wrapper that applies a CPU affinity list through SetSchedAffinity.
 *
 * @param cpu_list CPU ids to pass to SetSchedAffinity.
 * @return TNN_OK when SetSchedAffinity returns 0; TNNERR_SET_CPU_AFFINITY when it fails
 *         or when built for a platform other than Android/Linux.
 */
Status CpuUtils::SetCpuAffinity(const std::vector<int>& cpu_list) {
#if defined(__ANDROID__) || defined(__linux__)
    if (0 != SetSchedAffinity(cpu_list)) {
        return TNNERR_SET_CPU_AFFINITY;
    }
    return TNN_OK;
#else
    (void)cpu_list;  // Avoid unused parameter warning.
    return TNNERR_SET_CPU_AFFINITY;
#endif
}
- 基于主频对CPU进行排序
- 获得每个CPU的最大工作频率。该函数工作在Android设备上。方法也较为简单,通过读取对应的系统设备信息文件则可以获得:/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state。
// Reads "<freq_khz> <value>" pairs from an open time_in_state stream until one fails to
// parse, and returns the largest frequency seen (0 when none was read).
static int ReadMaxFreqFromTimeInState(FILE* fp) {
    int max_freq_khz = 0;
    while (!feof(fp)) {
        int freq_khz = 0;
        int nscan    = fscanf(fp, "%d %*d", &freq_khz);
        if (nscan != 1) {
            break;
        }

        if (freq_khz > max_freq_khz) {
            max_freq_khz = freq_khz;
        }
    }
    return max_freq_khz;
}

/**
 * Looks up the maximum frequency of one CPU from sysfs.
 *
 * Three files are tried in order:
 *   1. /sys/devices/system/cpu/cpufreq/stats/cpu<N>/time_in_state
 *   2. /sys/devices/system/cpu/cpu<N>/cpufreq/stats/time_in_state
 *   3. /sys/devices/system/cpu/cpu<N>/cpufreq/cpuinfo_max_freq
 * If the first file opens, its result is returned as-is (possibly 0). The second file is
 * used only when it yields a non-zero frequency; otherwise the third file is consulted.
 *
 * @param cpuid index substituted for <N> in the paths above.
 * @return the value read (the variable names indicate kHz), or -1 when none of the files
 *         can be opened or the third file holds no integer.
 */
static int GetMaxFreqOfCpu(int cpuid) {
    char path[256];

    // first try, for all possible cpu
    snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpuid);
    FILE* fp = fopen(path, "rb");
    if (fp) {
        const int max_freq_khz = ReadMaxFreqFromTimeInState(fp);
        fclose(fp);
        return max_freq_khz;
    }

    // second try, for online cpu
    snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state", cpuid);
    fp = fopen(path, "rb");
    if (fp) {
        const int max_freq_khz = ReadMaxFreqFromTimeInState(fp);
        fclose(fp);
        if (max_freq_khz != 0) {
            return max_freq_khz;
        }
    }

    // third try, for online cpu
    snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", cpuid);
    fp = fopen(path, "rb");
    if (!fp) {
        return -1;
    }

    int max_freq_khz = -1;
    if (fscanf(fp, "%d", &max_freq_khz) != 1) {
        max_freq_khz = -1;
    }
    fclose(fp);

    return max_freq_khz;
}
- 基于冒泡排序对所有CPU按主频进行排序。排序后的列表放在cpuids中,排序方法为降序排列。little_cluster_offset 返回第一个主频小于 (max_freq + min_freq)/2 的CPU在排序后列表中的偏移位置。
static int SortCpuidByMaxFrequency(std::vector& cpuids, int* little_cluster_offset) { const int cpu_count = cpuids.size(); *little_cluster_offset = 0; if (cpu_count == 0) return 0; std::vector cpu_max_freq_khz; cpu_max_freq_khz.resize(cpu_count); for (int i = 0; i < cpu_count; i++) { int max_freq_khz = GetMaxFreqOfCpu(i); // printf("%d max freq = %d khz", i, max_freq_khz); cpuids[i] = i; cpu_max_freq_khz[i] = max_freq_khz; } // sort cpuid as big core first // simple bubble sort for (int i = 0; i < cpu_count; i++) { for (int j = i + 1; j < cpu_count; j++) { if (cpu_max_freq_khz[i] < cpu_max_freq_khz[j]) { // swap int tmp = cpuids[i]; cpuids[i] = cpuids[j]; cpuids[j] = tmp; tmp = cpu_max_freq_khz[i]; cpu_max_freq_khz[i] = cpu_max_freq_khz[j]; cpu_max_freq_khz[j] = tmp; } } } // SMP int mid_max_freq_khz = (cpu_max_freq_khz.front() + cpu_max_freq_khz.back()) / 2; if (mid_max_freq_khz == cpu_max_freq_khz.back()) return 0; for (int i = 0; i < cpu_count; i++) { if (cpu_max_freq_khz[i] < mid_max_freq_khz) { *little_cluster_offset = i; break; } } return 0;}
附录
超线程技术(Hyper-Threading):利用特殊的硬件指令,把一个物理内核(CPU core)模拟成两个逻辑内核,使得单个处理器支持线程级并行计算,提高了CPU的运行效率。双核四线程等指的就是支持超线程技术的CPU。
物理CPU:实际主板上安装的CPU数。当主板上安装了1个4核CPU时,物理CPU个数为1。
逻辑CPU:一颗CPU可以有多核,使用超线程技术(HT)后,逻辑CPU数可以达到物理核数的两倍(Double CPU Core);
转载声明:
本文为头条号作者深度视野整理发布,任何个人或组织未经授权不得转载。
违规转载将追究法律责任。