最少组装或编译至少三个值

我正在查看GCC-4.8为x86_64生成的代码,并想知道是否有更好(更快)的方法来计算三个值的最小值。

这是Python的集合模块的摘录,它计算mrightindex+1leftindex的最小值:

  ssize_t m = n; if (m > rightindex + 1) m = rightindex + 1; if (m > leftindex) m = leftindex; 

GCC使用CMOV生成连续相关的代码:

 leaq 1(%rbp), %rdx cmpq %rsi, %rdx cmovg %rsi, %rdx cmpq %rbx, %rdx cmovg %rbx, %rdx 

是否有更快的代码可以通过删除数据依赖性来利用处理器无序并行执行? 我想知道是否存在用于计算多个值的最小值而不使用条件或谓词指令的已知技巧。 我也想知道是否有一些饱和的算术内在函数可以帮助解决这种情况。

EDITS:

  • 如图所示,代码使用带符号的算术,但无符号算术答案也会有所帮助。
  • 我询问了最少三个,但也对n最小的n感兴趣。
  • Linus对CMOV的警告: http : //ondioline.org/mail/cmov-a-bad-idea-on-out-of-order-cpus

最少两个无符号数具有经典解决方案:

 ; eax = min(eax, ebx), ecx - scratch register. .min2: sub ebx, eax sbb ecx, ecx and ecx, ebx add eax, ecx 

这种方法可能比使用cmov的解决方案更快,但是为了更高的速度,指令必须由其他指令分开以进行并行执行。

可以为三个数字实现此方法:

 ; eax = min(eax, ebx, edx), ecx - scratch register. .min3: sub ebx, eax sbb ecx, ecx and ecx, ebx add eax, ecx sub edx, eax sbb ecx, ecx and ecx, edx add eax, ecx 

另一种尝试是使用条件跳转来测试变体。 对于现代处理器,它可能更快,特别是如果跳跃是高度可预测的:

 .min3: cmp eax, ebx jle @f mov eax, ebx @@: cmp eax, edx jle @f mov eax, edx @@: 

以下是建议方法的基准测试结果。 处理器是Intel Core-i7 2600K,运行频率为4GHz。 每个循环的单位是纳秒(越小越好)。 测量包括伪随机测试数据生成和函数调用开销,以及测试中的实际min3代码。

  gcc cmov conditional classical sequence branches branchless pseudo-random data 2.24 6.31 2.39 fixed data 2.24 2.99 2.39 

基准源代码

functions.s:评估的3个解决方案:

 //---------------------------------------------------------------------------- .text .intel_syntax noprefix //----------------------------------------------------------------------------- // uint64_t min3_a (uint64_t rcx, uint64_t rdx, uint64_t r8) .globl min3_a min3_a: mov rax, rcx cmp rax, rdx cmovg rax, rdx cmp rax, r8 cmovg rax, r8 retq //----------------------------------------------------------------------------- // uint64_t min3_b (uint64_t rcx, uint64_t rdx, uint64_t r8) .globl min3_b min3_b: mov rax, rcx cmp rax, rdx jle skip1 mov rax, rdx skip1: cmp rax, r8 jle skip2 mov rax, r8 skip2: retq //----------------------------------------------------------------------------- // uint64_t min3_c (uint64_t rcx, uint64_t rdx, uint64_t r8) .globl min3_c min3_c: sub rdx, rcx sbb rax, rax and rax, rdx add rcx, rax sub r8, rcx sbb rax, rax and rax, r8 add rax, rcx retq //----------------------------------------------------------------------------- 

min3test.c,主程序(mingw64 / windows):

 #define __USE_MINGW_ANSI_STDIO 1 #include  #include  #include  uint64_t min3_a (uint64_t rcx, uint64_t rdx, uint64_t r8); uint64_t min3_b (uint64_t rcx, uint64_t rdx, uint64_t r8); uint64_t min3_c (uint64_t rcx, uint64_t rdx, uint64_t r8); //----------------------------------------------------------------------------- // // queryPerformanceCounter - similar to QueryPerformanceCounter, but returns // count directly. uint64_t queryPerformanceCounter (void) { LARGE_INTEGER int64; QueryPerformanceCounter (&int64); return int64.QuadPart; } //----------------------------------------------------------------------------- // // queryPerformanceFrequency - same as QueryPerformanceFrequency, but returns count direcly. uint64_t queryPerformanceFrequency (void) { LARGE_INTEGER int64; QueryPerformanceFrequency (&int64); return int64.QuadPart; } //---------------------------------------------------------------------------- // // lfsr64gpr - left shift galois type lfsr for 64-bit data, general purpose register implementation // static uint64_t lfsr64gpr (uint64_t data, uint64_t mask) { uint64_t carryOut = data >> 63; uint64_t maskOrZ = -carryOut; return (data << 1) ^ (maskOrZ & mask); } //--------------------------------------------------------------------------- uint64_t min3 (uint64_t a, uint64_t b, uint64_t c) { uint64_t smallest; smallest = a; if (smallest > b) smallest = b; if (smallest > c) smallest = c; return smallest; } //--------------------------------------------------------------------------- static void runtests (uint64_t pattern, uint64_t mask) { uint64_t startCount, elapsed, index, loops = 800000000; double ns; startCount = queryPerformanceCounter (); for (index = 0; index < loops; index++) { pattern = lfsr64gpr (pattern, mask); min3_a (pattern & 0xFFFFF, (pattern >> 20) & 0xFFFFF, (pattern >> 40) & 0xFFFFF); } elapsed = queryPerformanceCounter () - startCount; ns = (double) elapsed / queryPerformanceFrequency () * 1000000000 / loops; printf ("%7.2f ns\n", ns); startCount = queryPerformanceCounter (); for (index = 0; index < loops; index++) { pattern = lfsr64gpr (pattern, mask); min3_b (pattern & 0xFFFFF, (pattern >> 20) & 0xFFFFF, (pattern >> 40) & 0xFFFFF); } elapsed = queryPerformanceCounter () - startCount; ns = (double) elapsed / queryPerformanceFrequency () * 1000000000 / loops; printf ("%7.2f ns\n", ns); startCount = queryPerformanceCounter (); for (index = 0; index < loops; index++) { pattern = lfsr64gpr (pattern, mask); min3_c (pattern & 0xFFFFF, (pattern >> 20) & 0xFFFFF, (pattern >> 40) & 0xFFFFF); } elapsed = queryPerformanceCounter () - startCount; ns = (double) elapsed / queryPerformanceFrequency () * 1000000000 / loops; printf ("%7.2f ns\n", ns); } //--------------------------------------------------------------------------- int main (void) { uint64_t index, pattern, mask; uint64_t count_a = 0, count_b = 0, count_c = 0; mask = 0xBEFFFFFFFFFFFFFF; pattern = 1; // sanity check the asm functions for (index = 0; index < 1000000; index++) { uint64_t expected, result_a, result_b, result_c; uint64_t pattern1 = (pattern >> 0) & 0xFFFFF; uint64_t pattern2 = (pattern >> 20) & 0xFFFFF; uint64_t pattern3 = (pattern >> 40) & 0xFFFFF; pattern = lfsr64gpr (pattern, mask); expected = min3 (pattern1, pattern2, pattern3); result_a = min3_a (pattern1, pattern2, pattern3); result_b = min3_b (pattern1, pattern2, pattern3); result_c = min3_c (pattern1, pattern2, pattern3); if (result_a != expected) printf ("min3_a fail, %llu %llu %llu %llu %llu\n", expected, result_a, pattern1, pattern2, pattern3); if (result_b != expected) printf ("min3_b fail, %llu %llu %llu %llu %llu\n", expected, result_b, pattern1, pattern2, pattern3); if (result_c != expected) printf ("min3_c fail, %llu %llu %llu %llu %llu\n", expected, result_c, pattern1, pattern2, pattern3); if (expected == pattern1) count_a++; if (result_b == pattern2) count_b++; if (result_c == pattern3) count_c++; } printf ("pseudo-random distribution: %llu, %llu, %llu\n", count_a, count_b, count_c); // raise our priority to increase measurement accuracy SetPriorityClass (GetCurrentProcess (), REALTIME_PRIORITY_CLASS); printf ("using pseudo-random data\n"); runtests (1, mask); printf ("using fixed data\n"); runtests (0, mask); return 0; } //--------------------------------------------------------------------------- 

构建命令行:

 gcc -Wall -Wextra -O3 -omin3test.exe min3test.c functions.s 

函数min(x,y,z)是连续的,但其导数不是。 该衍生物在其定义的任何地方都有规范1。 没有办法将其表达为算术函数。

饱和算术有其自身的不连续性,因此在这种情况下不能使用先前的推理。 但是,饱和点与输入无关。 这反过来意味着你需要扩展输入,此时我相信结果代码不会更快。

这当然不能完全certificate不存在更快的代码,但你可能需要对其进行详尽的搜索。