多线程共享数据时cache不对齐对performance的影响

最新推荐文章于 2020-09-15 11:33:28 发布

Luffy Dong

最新推荐文章于 2020-09-15 11:33:28 发布

阅读量290

点赞数

本文链接：https://blog.csdn.net/dshj2007/article/details/87943744

版权

LMAX是一家英国的外汇交易公司，该公司开发了基于JVM的并发开源框架Disruptor，它能够在一个线程里每秒处理6百万订单，业务逻辑处理器完全运行在内存中，使用事件源驱动方式，其发布的开源架构介绍文章中讲到用内存填充的方式规避cache line对多线程并发的影响。

文章中提到了一个叫伪共享的概念: https://lmax-exchange.github.io/disruptor/files/Disruptor-1.0.pdf

什么是伪共享呢？如下图

两个线程分别跑在core1和core2上，各自只访问自己的变量X和Y。如果X和Y恰好落在同一条cache line上，由于cache一致性是以cache line为单位维护的，一个core写X会使另一个core上包含Y的这条cache line失效，另一个core再访问Y时必须重新加载它。两个线程逻辑上并没有共享数据，却要互相争用同一条cache line，这种场景叫做伪共享。

我们来验证下对于performance来说cache line不对齐到底有多少影响

在服务器上 cat /sys/devices/system/cpu/cpu0/cache/index0/coherency_line_size 得到dcache的cache line为64字节

创建多个线程，分别访问一个全局结构体数组中各自对应的元素（每个线程只写自己那个元素里的long成员value）

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#include <fcntl.h>
#include <pthread.h>
#include <sched.h>
#include <sys/ioctl.h>
#include <sys/resource.h>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>

/*
 * One benchmark slot: a single long that a worker thread writes, with seven
 * longs of filler declared before it and seven after it.
 * Commenting out the filler members leaves only `value`, so consecutive
 * array elements sit directly next to each other in memory.
 */
struct v_padding
{
    long p1, p2, p3, p4, p5, p6, p7;          /* leading filler */
    long value;                               /* the member the workers store to */
    long p9, p10, p11, p12, p13, p14, p15;    /* trailing filler */
};
/* Write targets for the workers: worker k stores to longs[k].value. */
static struct v_padding longs[10];

/* Not referenced by any code visible in this file. */
static int t_create = 0;

/* Loop count used by each worker's timed store loop. */
unsigned long iteration = 100000UL;

/* time_cost[k] receives worker k's measured loop time in microseconds. */
unsigned long time_cost[10] = { 0 };

/* Start-line barrier; main re-initialises it for every thread count. */
pthread_barrier_t g_barrier;

/*
 * Worker thread body: repeatedly stores 0 into longs[num].value and records
 * how long the store loop took.
 *
 * data  - pointer to an int holding this worker's slot index. The index is
 *         used for both longs[] and time_cost[] and is NOT range-checked
 *         here; the caller must pass a valid index for both arrays.
 *
 * The worker waits on g_barrier before taking its start timestamp, performs
 * `iteration` stores, and writes the elapsed wall-clock time in microseconds
 * to time_cost[num].
 *
 * Returns NULL.
 */
void *pt_fn_other(void * data)
{
    const int num = *(const int *)data;
    /*
     * Store through a volatile pointer so an optimizing compiler must emit
     * every store instead of collapsing the loop into a single write, which
     * would leave nothing to measure.
     */
    volatile long *slot = &longs[num].value;
    struct timeval start;
    struct timeval end;
    unsigned long i;
    long elapsed_us;

    pthread_barrier_wait(&g_barrier);
    gettimeofday(&start, NULL);

    /*
     * Counting up runs the body exactly `iteration` times and zero times
     * when `iteration` is 0; a pre-decrementing countdown would run one
     * iteration short and wrap around on 0.
     */
    for (i = 0; i < iteration; i++)
    {
        *slot = 0;
    }

    gettimeofday(&end, NULL);

    /* Signed arithmetic: the microsecond field difference may be negative. */
    elapsed_us = 1000000L * (long)(end.tv_sec - start.tv_sec)
               + (long)(end.tv_usec - start.tv_usec);
    time_cost[num] = (unsigned long)elapsed_us;

    return NULL;
}
 
/*
 * Create one worker thread that runs pt_fn_other, requesting the SCHED_OTHER
 * policy explicitly through the thread attributes.
 *
 * tid   - receives the id of the new thread when creation succeeds.
 * data  - passed unchanged to pt_fn_other as its argument.
 *
 * A failure to set the inherit-sched flag or the policy is reported on stdout
 * and thread creation is still attempted (best effort, as before).
 *
 * Returns 0 on success, otherwise the error number returned by
 * pthread_attr_init or pthread_create.
 */
int pcreate_other_thread(pthread_t *tid, void *data)
{
    pthread_attr_t attr;
    int ret;

    ret = pthread_attr_init(&attr);
    if (ret != 0)
    {
        printf("pthread_attr_init failed.\n");
        return ret;
    }

    if (pthread_attr_setinheritsched(&attr, PTHREAD_EXPLICIT_SCHED) != 0)
    {
        printf("pthread_attr_setinheritsched failed.\n");
    }

    if (pthread_attr_setschedpolicy(&attr, SCHED_OTHER) != 0)
    {
        printf("pthread_attr_setschedpolicy failed.\n");
    }

    ret = pthread_create(tid, &attr, pt_fn_other, data);
    if (ret != 0)
    {
        printf("pthread_create failed with error.\n");
    }

    /* The attribute object is only needed for the pthread_create call. */
    pthread_attr_destroy(&attr);

    return ret;
}
 
/*
 * Benchmark driver. For each thread count from 1 up to the size of the tid
 * array it:
 *   - initialises g_barrier for that many workers,
 *   - starts the workers through pcreate_other_thread,
 *   - joins them and accumulates their time_cost[] entries,
 *   - prints the elapsed time of the whole round, the average per-thread
 *     time and the largest per-thread time (all in microseconds).
 *
 * Returns 0 when every round completed, otherwise the non-zero error number
 * of the pthread call that failed.
 */
int main(void)
{
    struct timeval start;
    struct timeval end;
    pthread_t tid[10];
    int num[10] = {0,1,2,3,4,5,6,7,8,9};
    const int max_threads = (int)(sizeof(tid) / sizeof(tid[0]));
    int threads_num;
    int ret;
    int i;

    for (threads_num = 1; threads_num <= max_threads; threads_num++)
    {
        unsigned long total_time = 0;
        unsigned long max_time = 0;
        unsigned long p_time;

        ret = pthread_barrier_init(&g_barrier, NULL, (unsigned)threads_num);
        if (ret != 0)
        {
            printf("pthread_barrier_init failed %d.\n", ret);
            return ret;
        }

        gettimeofday(&start, NULL);
        for (i = 0; i < threads_num; i++)
        {
            /*
             * pthread functions report failure with a positive error number,
             * so any non-zero value is an error. Stop here: the barrier
             * expects threads_num workers, and continuing with fewer would
             * leave the others blocked on it and join an unset thread id.
             * Returning from main ends the process and any waiting workers.
             */
            ret = pcreate_other_thread(&tid[i], &num[i]);
            if (ret != 0)
            {
                printf("pcreate_other_thread thread failed %d.\n", ret);
                return ret;
            }
        }

        for (i = 0; i < threads_num; i++)
        {
            ret = pthread_join(tid[i], NULL);
            if (ret != 0)
            {
                printf("pthread_join failed %d.\n", ret);
                return ret;
            }

            /* time_cost[i] is only read after worker i has been joined. */
            total_time += time_cost[i];
            if (time_cost[i] > max_time)
            {
                max_time = time_cost[i];
            }
        }
        gettimeofday(&end, NULL);
        pthread_barrier_destroy(&g_barrier);

        p_time = (unsigned long)(1000000L * (long)(end.tv_sec - start.tv_sec)
                                 + (long)(end.tv_usec - start.tv_usec));
        printf("threads_num = %d main process time %lu average thread time %lu max     thread time %lu \n",
               threads_num, p_time, total_time / (unsigned long)threads_num, max_time);
    }

    printf("exit successed.\n");
    return 0;
}

在struct v_padding数据结构里注释掉填充即为伪共享的场景

在2.5Ghz Intel 服务器上运行结果如下

使用填充（即避免伪共享）时，线程数量从1到10增长，单个线程的平均运行时间（单位：微秒）为

615 640 638 657 570 651 637 583 595 531

不使用填充（即存在伪共享）时，线程数量从1到10增长，单个线程的平均运行时间（单位：微秒）为

585 1400 1853 2608 2686 3027 2494 2677 2688 3059

再用perf stat -e cache-misses查看下两者cache miss的区别

不使用填充（存在伪共享）时的数据为30,005 cache-misses

使用填充（避免伪共享）后的数据为19,865 cache-misses

可见cache line对于performance的影响还是比较大的

由于代码整体运行环境的影响，我们无法准确说出cache miss影响performance的绝对值，但是从多次测试数据来说，影响的相对值还是很可观的。

Luffy Dong

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
多线程共享数据时cache不对齐对performance的影响

LMAX是一家英国的外汇交易公司，该公司开发了基于JVM的并发开源框架Disruptor，它能够在一个线程里每秒处理6百万订单，业务逻辑处理器完全运行在内存中，使用事件源驱动方式，其发布的开源架构介绍文章中讲到用内存填充的方式规避cache line对多线程并发的影响。文章中提到了一个叫伪共享的概念: https://lmax-exchange.github.io/disruptor/file...
复制链接

扫一扫