IT博客汇
  • 首页
  • 精华
  • 技术
  • 设计
  • 资讯
  • 扯淡
  • 权利声明
  • 登录 注册

    A False-Sharing Test

    dutor发表于 2013-03-01 14:53:14
    love 0

      今天通过酷壳推荐的一篇文章,了解了一下 Cache 的“伪共享”(False Sharing),于是写了个小程序做了个简单的测试:

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    
    //! false_sharing.cpp
    #include <iostream>
    #include <pthread.h>
    #include <sys/time.h>
    using namespace std;
     
    //! Micro-benchmark for cache-line false sharing.
    //!
    //! Starts NTH threads; each one performs NLOOP increments on its own
    //! counter. The counters are consecutive elements of one static array and
    //! every element carries NPAD bytes of padding after the counter, so NPAD
    //! controls how far apart the counters written by different threads are.
    //!
    //! \tparam NPAD   padding bytes that follow each counter (0 = none)
    //! \tparam NTH    number of worker threads
    //! \tparam NLOOP  increments performed by each thread
    template <size_t NPAD, size_t NTH = 4, size_t NLOOP = (1<<30)>
    class FalseSharing {
    public:
      //! Runs the benchmark once and prints the template parameters and the
      //! elapsed wall-clock time in milliseconds to std::cout.
      //!
      //! The timed interval spans from just before the first thread is created
      //! until the last thread has been joined. If a thread cannot be created,
      //! the threads already started are joined, the error code is written to
      //! std::cerr and no result is printed.
      void run() {
        struct timeval b, e;
        gettimeofday(&b, NULL);
        pthread_t threads[NTH];
        size_t started = 0;
        int rc = 0;
        for (; started < NTH; ++started) {
          // The thread index travels inside the void* argument itself.
          rc = pthread_create(&threads[started], NULL, hook,
                              reinterpret_cast<void*> (started));
          if (rc != 0) {
            break;
          }
        }

        // Join only the handles pthread_create actually filled in; joining an
        // uninitialised pthread_t is undefined behaviour.
        for (size_t i = 0; i < started; ++i) {
          pthread_join(threads[i], NULL);
        }
        gettimeofday(&e, NULL);

        if (rc != 0) {
          std::cerr << "pthread_create failed, error " << rc << std::endl;
          return;
        }

        std::cout << "padding bytes: " << NPAD << '\n';
        std::cout << "thread count: " << NTH << '\n';
        std::cout << "loop count: " << NLOOP << '\n';
        // Microsecond difference first, then truncated to milliseconds.
        std::cout << "elapsed(ms): "
                  << ((e.tv_sec-b.tv_sec)*1000000 + (e.tv_usec-b.tv_usec)) / 1000
                  << '\n';
        std::cout << std::endl;
      }

    private:
      //! Thread entry point: increments counter number `args` NLOOP times.
      //! \param args  the thread index, stored directly in the pointer value
      //! \return always NULL
      static void* hook(void *args) {
        const size_t ith = reinterpret_cast<size_t> (args);
        volatile size_t& counter = s[ith].n;
        for (size_t i = 0; i < NLOOP; ++i) {
          // Volatile access: every iteration must really load and store the
          // counter, otherwise an optimising build may fold the whole loop
          // into a single addition and nothing would be measured.
          counter = counter + 1;
        }
        return NULL;
      }

    private:
      //! One array element: a counter followed by N padding bytes.
      template <size_t N, bool kNoPadding = (N == 0)>
      struct Slot {
        volatile size_t n;
        char            padding[N];
      };
      //! N == 0: omit the padding member entirely, because a zero-length array
      //! is not valid ISO C++ (GCC only accepts it as an extension).
      template <size_t N>
      struct Slot<N, true> {
        volatile size_t n;
      };
      typedef Slot<NPAD> S;

      //! One element per thread; thread i only ever touches s[i].
      static S s[NTH];
    };

    //! Storage for the per-thread counters (static storage, so zero-initialised).
    template <size_t NPAD, size_t NTH, size_t NLOOP>
    typename FalseSharing<NPAD, NTH, NLOOP>::S FalseSharing<NPAD, NTH, NLOOP>::s[NTH];
     
    //! Entry point: runs the benchmark twice with the default thread and loop
    //! counts, first with no padding after each counter, then with 56 padding
    //! bytes (where size_t is 8 bytes this makes each array element 64 bytes).
    //! \return always 0
    int
    main() {
      FalseSharing<0> test1;
      test1.run();
      FalseSharing<56> test2;
      test2.run();
      return 0;
    }
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    
    $ cat /proc/cpuinfo | egrep 'model name|cache_alignment'
    model name	: Intel(R) Core(TM) i7-3520M CPU @ 2.90GHz
    cache_alignment	: 64
    model name	: Intel(R) Core(TM) i7-3520M CPU @ 2.90GHz
    cache_alignment	: 64
    model name	: Intel(R) Core(TM) i7-3520M CPU @ 2.90GHz
    cache_alignment	: 64
    model name	: Intel(R) Core(TM) i7-3520M CPU @ 2.90GHz
    cache_alignment	: 64
    $ g++ false_sharing.cpp -pthread
    $ time ./a.out 
    padding bytes: 0
    thread count: 4
    loop count: 1073741824
    elapsed(ms): 12638
     
    padding bytes: 56
    thread count: 4
    loop count: 1073741824
    elapsed(ms): 3888
     
    real	0m16.530s
    user	1m3.688s
    sys	0m0.044s

    PS. 上面的结果是在一台2核4线程的 i7-3520M 笔记本上测得的;在一台4核16线程的 E5520(2.27GHz)机器上,结果更加触目惊心:

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    
    padding bytes: 0
    thread count: 4
    loop count: 1073741824
    elapsed(ms): 13646
     
    padding bytes: 56
    thread count: 4
    loop count: 1073741824
    elapsed(ms): 3548
     
     
    padding bytes: 0
    thread count: 16
    loop count: 1073741824
    elapsed(ms): 43848
     
    padding bytes: 56
    thread count: 16
    loop count: 1073741824
    elapsed(ms): 5051

    UPDATE
      有意思的是,根据 Felix 同学的测试,性能差距并没有这么大,详情请移步到这里。



沪ICP备19023445号-2号
友情链接