Here are benchmark results for the methods suggested. Processor is Intel Core-i7 2600K running at 4GHz. Units are nanoseconds per loop (smaller is better). Measurements include pseudo-random test data generation and function call overhead, in addition to the actual min3 code under test.
gcc cmov conditional classical
sequence branches branchless
pseudo-random data 2.24 6.31 2.39
fixed data 2.24 2.99 2.39
Benchmark source code
functions.s: the 3 solutions evaluated:
//----------------------------------------------------------------------------
.text
.intel_syntax noprefix
//-----------------------------------------------------------------------------
// uint64_t min3_a (uint64_t rcx, uint64_t rdx, uint64_t r8)
.globl min3_a
min3_a:
mov rax, rcx
cmp rax, rdx
cmovg rax, rdx
cmp rax, r8
cmovg rax, r8
retq
//-----------------------------------------------------------------------------
// uint64_t min3_b (uint64_t rcx, uint64_t rdx, uint64_t r8)
.globl min3_b
min3_b:
mov rax, rcx
cmp rax, rdx
jle skip1
mov rax, rdx
skip1:
cmp rax, r8
jle skip2
mov rax, r8
skip2:
retq
//-----------------------------------------------------------------------------
// uint64_t min3_c (uint64_t rcx, uint64_t rdx, uint64_t r8)
.globl min3_c
min3_c:
sub rdx, rcx
sbb rax, rax
and rax, rdx
add rcx, rax
sub r8, rcx
sbb rax, rax
and rax, r8
add rax, rcx
retq
//-----------------------------------------------------------------------------
min3test.c, main program (mingw64/windows):
#define __USE_MINGW_ANSI_STDIO 1
#include <windows.h>
#include <stdio.h>
#include <stdint.h>
uint64_t min3_a (uint64_t rcx, uint64_t rdx, uint64_t r8);
uint64_t min3_b (uint64_t rcx, uint64_t rdx, uint64_t r8);
uint64_t min3_c (uint64_t rcx, uint64_t rdx, uint64_t r8);
//-----------------------------------------------------------------------------
//
// queryPerformanceCounter - similar to QueryPerformanceCounter, but returns
// count directly.
uint64_t queryPerformanceCounter (void)
{
LARGE_INTEGER int64;
QueryPerformanceCounter (&int64);
return int64.QuadPart;
}
//-----------------------------------------------------------------------------
//
// queryPerformanceFrequency - same as QueryPerformanceFrequency, but returns count direcly.
uint64_t queryPerformanceFrequency (void)
{
LARGE_INTEGER int64;
QueryPerformanceFrequency (&int64);
return int64.QuadPart;
}
//----------------------------------------------------------------------------
//
// lfsr64gpr - left shift galois type lfsr for 64-bit data, general purpose register implementation
//
static uint64_t lfsr64gpr (uint64_t data, uint64_t mask)
{
uint64_t carryOut = data >> 63;
uint64_t maskOrZ = -carryOut;
return (data << 1) ^ (maskOrZ & mask);
}
//---------------------------------------------------------------------------
uint64_t min3 (uint64_t a, uint64_t b, uint64_t c)
{
uint64_t smallest;
smallest = a;
if (smallest > b) smallest = b;
if (smallest > c) smallest = c;
return smallest;
}
//---------------------------------------------------------------------------
static void runtests (uint64_t pattern, uint64_t mask)
{
uint64_t startCount, elapsed, index, loops = 800000000;
double ns;
startCount = queryPerformanceCounter ();
for (index = 0; index < loops; index++)
{
pattern = lfsr64gpr (pattern, mask);
min3_a (pattern & 0xFFFFF, (pattern >> 20) & 0xFFFFF, (pattern >> 40) & 0xFFFFF);
}
elapsed = queryPerformanceCounter () - startCount;
ns = (double) elapsed / queryPerformanceFrequency () * 1000000000 / loops;
printf ("%7.2f ns
", ns);
startCount = queryPerformanceCounter ();
for (index = 0; index < loops; index++)
{
pattern = lfsr64gpr (pattern, mask);
min3_b (pattern & 0xFFFFF, (pattern >> 20) & 0xFFFFF, (pattern >> 40) & 0xFFFFF);
}
elapsed = queryPerformanceCounter () - startCount;
ns = (double) elapsed / queryPerformanceFrequency () * 1000000000 / loops;
printf ("%7.2f ns
", ns);
startCount = queryPerformanceCounter ();
for (index = 0; index < loops; index++)
{
pattern = lfsr64gpr (pattern, mask);
min3_c (pattern & 0xFFFFF, (pattern >> 20) & 0xFFFFF, (pattern >> 40) & 0xFFFFF);
}
elapsed = queryPerformanceCounter () - startCount;
ns = (double) elapsed / queryPerformanceFrequency () * 1000000000 / loops;
printf ("%7.2f ns
", ns);
}
//---------------------------------------------------------------------------
int main (void)
{
uint64_t index, pattern, mask;
uint64_t count_a = 0, count_b = 0, count_c = 0;
mask = 0xBEFFFFFFFFFFFFFF;
pattern = 1;
// sanity check the asm functions
for (index = 0; index < 1000000; index++)
{
uint64_t expected, result_a, result_b, result_c;
uint64_t pattern1 = (pattern >> 0) & 0xFFFFF;
uint64_t pattern2 = (pattern >> 20) & 0xFFFFF;
uint64_t pattern3 = (pattern >> 40) & 0xFFFFF;
pattern = lfsr64gpr (pattern, mask);
expected = min3 (pattern1, pattern2, pattern3);
result_a = min3_a (pattern1, pattern2, pattern3);
result_b = min3_b (pattern1, pattern2, pattern3);
result_c = min3_c (pattern1, pattern2, pattern3);
if (result_a != expected) printf ("min3_a fail, %llu %llu %llu %llu %llu
", expected, result_a, pattern1, pattern2, pattern3);
if (result_b != expected) printf ("min3_b fail, %llu %llu %llu %llu %llu
", expected, result_b, pattern1, pattern2, pattern3);
if (result_c != expected) printf ("min3_c fail, %llu %llu %llu %llu %llu
", expected, result_c, pattern1, pattern2, pattern3);
if (expected == pattern1) count_a++;
if (result_b == pattern2) count_b++;
if (result_c == pattern3) count_c++;
}
printf ("pseudo-random distribution: %llu, %llu, %llu
", count_a, count_b, count_c);
// raise our priority to increase measurement accuracy
SetPriorityClass (GetCurrentProcess (), REALTIME_PRIORITY_CLASS);
printf ("using pseudo-random data
");
runtests (1, mask);
printf ("using fixed data
");
runtests (0, mask);
return 0;
}
//---------------------------------------------------------------------------
build command line:
gcc -Wall -Wextra -O3 -omin3test.exe min3test.c functions.s