With your programs, I get
(write) Bandwidth = 6.076 GB/s
(read) Bandwidth = 10.916 GB/s
on a desktop (Core i7, x86-64, GCC 4.9, GNU libc 2.19) machine with six 2GB DIMMs. (I don't have any more detail than that to hand, sorry.)
However, this program reports write bandwidth of 12.209 GB/s
:
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <emmintrin.h>
static void
nt_memset(char *buf, unsigned char val, size_t n)
{
/* this will only work with aligned address and size */
assert((uintptr_t)buf % sizeof(__m128i) == 0);
assert(n % sizeof(__m128i) == 0);
__m128i xval = _mm_set_epi8(val, val, val, val,
val, val, val, val,
val, val, val, val,
val, val, val, val);
for (__m128i *p = (__m128i*)buf; p < (__m128i*)(buf + n); p++)
_mm_stream_si128(p, xval);
_mm_sfence();
}
/* same main() as your write test, except calling nt_memset instead of memset */
The magic is all in _mm_stream_si128
, aka the machine instruction movntdq
, which writes a 16-byte quantity to system RAM, bypassing the cache (the official jargon for this is "non-temporal store"). I think this pretty conclusively demonstrates that the performance difference is all about the cache behavior.
N.B. glibc 2.19 does have an elaborately hand-optimized memset
that makes use of vector instructions. However, it does not use non-temporal stores. That's probably the Right Thing for memset
; in general, you clear memory shortly before using it, so you want it to be hot in the cache. (I suppose an even cleverer memset
might switch to non-temporal stores for really huge block clear, on the theory that you could not possibly want all of that in the cache, because the cache simply isn't that big.)
Dump of assembler code for function memset:
=> 0x00007ffff7ab9420 <+0>: movd %esi,%xmm8
0x00007ffff7ab9425 <+5>: mov %rdi,%rax
0x00007ffff7ab9428 <+8>: punpcklbw %xmm8,%xmm8
0x00007ffff7ab942d <+13>: punpcklwd %xmm8,%xmm8
0x00007ffff7ab9432 <+18>: pshufd $0x0,%xmm8,%xmm8
0x00007ffff7ab9438 <+24>: cmp $0x40,%rdx
0x00007ffff7ab943c <+28>: ja 0x7ffff7ab9470 <memset+80>
0x00007ffff7ab943e <+30>: cmp $0x10,%rdx
0x00007ffff7ab9442 <+34>: jbe 0x7ffff7ab94e2 <memset+194>
0x00007ffff7ab9448 <+40>: cmp $0x20,%rdx
0x00007ffff7ab944c <+44>: movdqu %xmm8,(%rdi)
0x00007ffff7ab9451 <+49>: movdqu %xmm8,-0x10(%rdi,%rdx,1)
0x00007ffff7ab9458 <+56>: ja 0x7ffff7ab9460 <memset+64>
0x00007ffff7ab945a <+58>: repz retq
0x00007ffff7ab945c <+60>: nopl 0x0(%rax)
0x00007ffff7ab9460 <+64>: movdqu %xmm8,0x10(%rdi)
0x00007ffff7ab9466 <+70>: movdqu %xmm8,-0x20(%rdi,%rdx,1)
0x00007ffff7ab946d <+77>: retq
0x00007ffff7ab946e <+78>: xchg %ax,%ax
0x00007ffff7ab9470 <+80>: lea 0x40(%rdi),%rcx
0x00007ffff7ab9474 <+84>: movdqu %xmm8,(%rdi)
0x00007ffff7ab9479 <+89>: and $0xffffffffffffffc0,%rcx
0x00007ffff7ab947d <+93>: movdqu %xmm8,-0x10(%rdi,%rdx,1)
0x00007ffff7ab9484 <+100>: movdqu %xmm8,0x10(%rdi)
0x00007ffff7ab948a <+106>: movdqu %xmm8,-0x20(%rdi,%rdx,1)
0x00007ffff7ab9491 <+113>: movdqu %xmm8,0x20(%rdi)
0x00007ffff7ab9497 <+119>: movdqu %xmm8,-0x30(%rdi,%rdx,1)
0x00007ffff7ab949e <+126>: movdqu %xmm8,0x30(%rdi)
0x00007ffff7ab94a4 <+132>: movdqu %xmm8,-0x40(%rdi,%rdx,1)
0x00007ffff7ab94ab <+139>: add %rdi,%rdx
0x00007ffff7ab94ae <+142>: and $0xffffffffffffffc0,%rdx
0x00007ffff7ab94b2 <+146>: cmp %rdx,%rcx
0x00007ffff7ab94b5 <+149>: je 0x7ffff7ab945a <memset+58>
0x00007ffff7ab94b7 <+151>: nopw 0x0(%rax,%rax,1)
0x00007ffff7ab94c0 <+160>: movdqa %xmm8,(%rcx)
0x00007ffff7ab94c5 <+165>: movdqa %xmm8,0x10(%rcx)
0x00007ffff7ab94cb <+171>: movdqa %xmm8,0x20(%rcx)
0x00007ffff7ab94d1 <+177>: movdqa %xmm8,0x30(%rcx)
0x00007ffff7ab94d7 <+183>: add $0x40,%rcx
0x00007ffff7ab94db <+187>: cmp %rcx,%rdx
0x00007ffff7ab94de <+190>: jne 0x7ffff7ab94c0 <memset+160>
0x00007ffff7ab94e0 <+192>: repz retq
0x00007ffff7ab94e2 <+194>: movq %xmm8,%rcx
0x00007ffff7ab94e7 <+199>: test $0x18,%dl
0x00007ffff7ab94ea <+202>: jne 0x7ffff7ab950e <memset+238>
0x00007ffff7ab94ec <+204>: test $0x4,%dl
0x00007ffff7ab94ef <+207>: jne 0x7ffff7ab9507 <memset+231>
0x00007ffff7ab94f1 <+209>: test $0x1,%dl
0x00007ffff7ab94f4 <+212>: je 0x7ffff7ab94f8 <memset+216>
0x00007ffff7ab94f6 <+214>: mov %cl,(%rdi)
0x00007ffff7ab94f8 <+216>: test $0x2,%dl
0x00007ffff7ab94fb <+219>: je 0x7ffff7ab945a <memset+58>
0x00007ffff7ab9501 <+225>: mov %cx,-0x2(%rax,%rdx,1)
0x00007ffff7ab9506 <+230>: retq
0x00007ffff7ab9507 <+231>: mov %ecx,(%rdi)
0x00007ffff7ab9509 <+233>: mov %ecx,-0x4(%rdi,%rdx,1)
0x00007ffff7ab950d <+237>: retq
0x00007ffff7ab950e <+238>: mov %rcx,(%rdi)
0x00007ffff7ab9511 <+241>: mov %rcx,-0x8(%rdi,%rdx,1)
0x00007ffff7ab9516 <+246>: retq
(This is in libc.so.6
, not the program itself -- the other person who tried to dump the assembly for memset
seems only to have found its PLT entry. The easiest way to get the assembly dump for the real memset
on a Unixy system is
$ gdb ./a.out
(gdb) set env LD_BIND_NOW t
(gdb) b main
Breakpoint 1 at [address]
(gdb) r
Breakpoint 1, [address] in main ()
(gdb) disas memset
...
.)