为什么添加额外的检查循环会在某些机器上产生很大的差异，而在其他机器上的差异很小？

我一直在做一些测试，看看在循环中检查的附加边界差异有多大。通过考虑访问数组时由C＃，Java等语言插入的隐式边界检查的成本来提示这一点。

更新：我在几台额外的计算机上尝试了相同的可执行程序，这会对正在发生的事情提供更多的启示。我首先列出了原始计算机，然后是我的现代笔记本电脑。在我的现代笔记本电脑上，在循环中添加额外的检查只会增加1到4％的时间，相比之下原始硬件的3到30％。

Processor x86 Family 6 Model 30 Stepping 5 GenuineIntel ~2793 Mhz Ratio 2 checks : 1 check = 1.0310 Ratio 3 checks : 1 check = 1.2769 Processor Intel(R) Core(TM) i7-3610QM CPU @ 2.30GHz, 2301 Mhz, 4 Core(s), 8 Logical Processor(s) Ratio 2 checks : 1 check = 1.0090 Ratio 3 checks : 1 check = 1.0393 Processor Intel(R) Core(TM) i5-2500 CPU @ 3.30GHz, 4 Cores(s) Ratio 2 checks : 1 check = 1.0035 Ratio 3 checks : 1 check = 1.0639 Processor Intel(R) Core(TM)2 Duo CPU T9300 @ 2.50GHz, 2501 Mhz, 2 Core(s), 2 Logical Processor(s) Ratio 2 checks : 1 check = 1.1195 Ratio 3 checks : 1 check = 1.3597 Processor x86 Family 15 Model 43 Stepping 1 AuthenticAMD ~2010 Mhz Ratio 2 checks : 1 check = 1.0776 Ratio 3 checks : 1 check = 1.1451

在下面的测试程序中，第一个函数只检查一个边界，第二个函数检查两个，第三个检查三个（在调用代码中， n1=n2=n3 ）。我发现两个检查比例：一个是大约1.03， 三个检查的比率：一个是大约1.3。我更惊讶的是，添加一张支票对性能产生了很大的影响。我得到了一个有趣的答案，关于现代处理器的边界检查成本低于原始问题，这可能会对这里观察到的差异有所了解。

请注意，在未启用整个程序优化的情况下编译程序非常重要; 否则编译器可以简单地删除附加边界检查。

 // dotprod.cpp #include "dotprod.h" double SumProduct(const double* v1, const double* v2, int n) { double sum=0; for(int i=0; i<n; ++i) sum += v1[i]*v2[i]; return sum; } double SumProduct(const double* v1, const double* v2, int n1, int n2) { double sum=0; for(int i=0; i<n1 && i <n2; ++i) sum += v1[i]*v2[i]; return sum; } double SumProduct(const double* v1, const double* v2, int n1, int n2, int n3) { double sum=0; for(int i=0; i<n1 && i <n2 && i <n3; ++i) sum += v1[i]*v2[i]; return sum; }

此代码最初是使用Visual Studio 2010，Release，Win32构建的（我添加了’C’标记，因为速度差异背后的原因可能不是C ++特定的，可能不是Windows特定的）。有谁能解释一下？

下面的其余代码，供参考。这里面有一些C ++特有的东西。

头文件

 // dotprod.h double SumProduct(const double*, const double*, int n); double SumProduct(const double*, const double*, int n1, int n2); double SumProduct(const double*, const double*, int n1, int n2, int n3);

测试线束

 // main.cpp #include  #include  #include  #include  #include  #include "../dotprod/dotprod.h" // separate lib typedef __int64 timecount_t; inline timecount_t GetTimeCount() { LARGE_INTEGER li; if (!QueryPerformanceCounter(&li)) { exit(1); } return li.QuadPart; } int main() { typedef std::vector dvec; const int N = 100 * 1000; // Initialize dvec v1(N); dvec v2(N); dvec dp1(N); dvec dp2(N); dvec dp3(N); for(int i=0; i<N; ++i) { v1[i] = i; v2[i] = log(static_cast(i+1)); } const timecount_t t0 = GetTimeCount(); // Check cost with one bound for(int n=0; n<N; ++n) { dp1[n] = SumProduct(&(v1[0]),&(v2[0]),n); } const timecount_t t1 = GetTimeCount(); // Check cost with two bounds for(int n=0; n<N; ++n) { dp2[n] = SumProduct(&(v1[0]),&(v2[0]),n,n); } const timecount_t t2 = GetTimeCount(); // Check cost with three bounds for(int n=0; n<N; ++n) { dp3[n] = SumProduct(&(v1[0]),&(v2[0]),n,n,n); } const timecount_t t3 = GetTimeCount(); // Check results const double sumSumProducts1 = std::accumulate(dp1.begin(), dp1.end(), 0.0); const double sumSumProducts2 = std::accumulate(dp2.begin(), dp2.end(), 0.0); const double sumSumProducts3 = std::accumulate(dp3.begin(), dp3.end(), 0.0); printf("Sums of dot products: %.1f, %.1f, %.1f\n", sumSumProducts1, sumSumProducts2, sumSumProducts3); // Output timings const timecount_t elapsed1 = t1-t0; const timecount_t elapsed2 = t2-t1; const timecount_t elapsed3 = t3-t2; printf("Elapsed: %.0f, %.0f, %.0f\n", static_cast(elapsed1), static_cast(elapsed2), static_cast(elapsed3)); const double ratio2to1 = elapsed2 / static_cast(elapsed1); const double ratio3to1 = elapsed3 / static_cast(elapsed1); printf("Ratio 2:1=%.2f\n", ratio2to1); printf("Ratio 3:1=%.2f\n", ratio3to1); return 0; }

为了产生汇编，我在这个答案中接受了建议（案例2，关闭整个程序优化），生成以下asm文件。

 ; Listing generated by Microsoft (R) Optimizing Compiler Version 16.00.40219.01 TITLE C:\dev\TestSpeed\dotprod\dotprod.cpp .686P .XMM include listing.inc .model flat INCLUDELIB OLDNAMES PUBLIC __real@0000000000000000 PUBLIC ?SumProduct@@YANPBN0HHH@Z ; SumProduct EXTRN __fltused:DWORD ; COMDAT __real@0000000000000000 ; File c:\dev\testspeed\dotprod\dotprod.cpp CONST SEGMENT __real@0000000000000000 DQ 00000000000000000r ; 0 ; Function compile flags: /Ogtp CONST ENDS ; COMDAT ?SumProduct@@YANPBN0HHH@Z _TEXT SEGMENT tv491 = -4 ; size = 4 _v1$ = 8 ; size = 4 _v2$ = 12 ; size = 4 _n1$ = 16 ; size = 4 _n2$ = 20 ; size = 4 _n3$ = 24 ; size = 4 ?SumProduct@@YANPBN0HHH@Z PROC ; SumProduct, COMDAT ; 25 : { push ebp mov ebp, esp push ecx ; 26 : double sum=0; fldz push ebx mov ebx, DWORD PTR _v2$[ebp] push esi push edi mov edi, DWORD PTR _n1$[ebp] ; 27 : for(int i=0; xor ecx, ecx ; 28 : i<n1 && i <n2 && i <n3; ; 29 : ++i) cmp edi, 4 jl $LC8@SumProduct ; 26 : double sum=0; mov edi, DWORD PTR _v1$[ebp] lea esi, DWORD PTR [edi+24] ; 30 : sum += v1[i]*v2[i]; sub edi, ebx lea edx, DWORD PTR [ecx+2] lea eax, DWORD PTR [ebx+8] mov DWORD PTR tv491[ebp], edi $LN15@SumProduct: ; 28 : i<n1 && i <n2 && i <n3; ; 29 : ++i) mov ebx, DWORD PTR _n2$[ebp] cmp ecx, ebx jge $LN9@SumProduct cmp ecx, DWORD PTR _n3$[ebp] jge $LN9@SumProduct ; 30 : sum += v1[i]*v2[i]; fld QWORD PTR [eax-8] lea edi, DWORD PTR [edx-1] fmul QWORD PTR [esi-24] faddp ST(1), ST(0) cmp edi, ebx jge SHORT $LN9@SumProduct ; 28 : i<n1 && i <n2 && i <n3; ; 29 : ++i) cmp edi, DWORD PTR _n3$[ebp] jge SHORT $LN9@SumProduct ; 30 : sum += v1[i]*v2[i]; mov edi, DWORD PTR tv491[ebp] fld QWORD PTR [edi+eax] fmul QWORD PTR [eax] faddp ST(1), ST(0) cmp edx, ebx jge SHORT $LN9@SumProduct ; 28 : i<n1 && i <n2 && i <n3; ; 29 : ++i) cmp edx, DWORD PTR _n3$[ebp] jge SHORT $LN9@SumProduct ; 30 : sum += v1[i]*v2[i]; fld QWORD PTR [eax+8] lea edi, DWORD PTR [edx+1] fmul QWORD PTR [esi-8] faddp ST(1), ST(0) cmp edi, ebx jge SHORT $LN9@SumProduct ; 28 : i<n1 && i <n2 && i <n3; ; 29 : ++i) cmp edi, DWORD PTR _n3$[ebp] jge SHORT $LN9@SumProduct ; 30 : sum += v1[i]*v2[i]; fld QWORD PTR [eax+16] mov edi, DWORD PTR _n1$[ebp] fmul QWORD PTR [esi] add ecx, 4 lea ebx, DWORD PTR [edi-3] add eax, 32 ; 00000020H add esi, 32 ; 00000020H faddp ST(1), ST(0) add edx, 4 cmp ecx, ebx jl SHORT $LN15@SumProduct mov ebx, DWORD PTR _v2$[ebp] $LC8@SumProduct: ; 28 : i<n1 && i <n2 && i <n3; ; 29 : ++i) cmp ecx, edi jge SHORT $LN9@SumProduct mov edx, DWORD PTR _v1$[ebp] lea eax, DWORD PTR [ebx+ecx*8] sub edx, ebx $LC3@SumProduct: cmp ecx, DWORD PTR _n2$[ebp] jge SHORT $LN9@SumProduct cmp ecx, DWORD PTR _n3$[ebp] jge SHORT $LN9@SumProduct ; 30 : sum += v1[i]*v2[i]; fld QWORD PTR [eax+edx] inc ecx fmul QWORD PTR [eax] add eax, 8 faddp ST(1), ST(0) cmp ecx, edi jl SHORT $LC3@SumProduct $LN9@SumProduct: ; 31 : return sum; ; 32 : } pop edi pop esi pop ebx mov esp, ebp pop ebp ret 0 ?SumProduct@@YANPBN0HHH@Z ENDP ; SumProduct _TEXT ENDS PUBLIC ?SumProduct@@YANPBN0HH@Z ; SumProduct ; Function compile flags: /Ogtp ; COMDAT ?SumProduct@@YANPBN0HH@Z _TEXT SEGMENT tv448 = -4 ; size = 4 _v1$ = 8 ; size = 4 _v2$ = 12 ; size = 4 _n1$ = 16 ; size = 4 _n2$ = 20 ; size = 4 ?SumProduct@@YANPBN0HH@Z PROC ; SumProduct, COMDAT ; 15 : { push ebp mov ebp, esp push ecx ; 16 : double sum=0; fldz push ebx mov ebx, DWORD PTR _v2$[ebp] push esi push edi mov edi, DWORD PTR _n1$[ebp] ; 17 : for(int i=0; xor ecx, ecx ; 18 : i<n1 && i <n2; ; 19 : ++i) cmp edi, 4 jl SHORT $LC8@SumProduct@2 ; 16 : double sum=0; mov edi, DWORD PTR _v1$[ebp] lea edx, DWORD PTR [edi+24] ; 20 : sum += v1[i]*v2[i]; sub edi, ebx lea esi, DWORD PTR [ecx+2] lea eax, DWORD PTR [ebx+8] mov DWORD PTR tv448[ebp], edi $LN19@SumProduct@2: mov edi, DWORD PTR _n2$[ebp] cmp ecx, edi jge SHORT $LN9@SumProduct@2 fld QWORD PTR [eax-8] lea ebx, DWORD PTR [esi-1] fmul QWORD PTR [edx-24] faddp ST(1), ST(0) cmp ebx, edi jge SHORT $LN9@SumProduct@2 mov ebx, DWORD PTR tv448[ebp] fld QWORD PTR [ebx+eax] fmul QWORD PTR [eax] faddp ST(1), ST(0) cmp esi, edi jge SHORT $LN9@SumProduct@2 fld QWORD PTR [eax+8] lea ebx, DWORD PTR [esi+1] fmul QWORD PTR [edx-8] faddp ST(1), ST(0) cmp ebx, edi jge SHORT $LN9@SumProduct@2 fld QWORD PTR [eax+16] mov edi, DWORD PTR _n1$[ebp] fmul QWORD PTR [edx] add ecx, 4 lea ebx, DWORD PTR [edi-3] add eax, 32 ; 00000020H add edx, 32 ; 00000020H faddp ST(1), ST(0) add esi, 4 cmp ecx, ebx jl SHORT $LN19@SumProduct@2 mov ebx, DWORD PTR _v2$[ebp] $LC8@SumProduct@2: ; 18 : i<n1 && i <n2; ; 19 : ++i) cmp ecx, edi jge SHORT $LN9@SumProduct@2 mov edx, DWORD PTR _v1$[ebp] lea eax, DWORD PTR [ebx+ecx*8] sub edx, ebx $LC3@SumProduct@2: cmp ecx, DWORD PTR _n2$[ebp] jge SHORT $LN9@SumProduct@2 ; 20 : sum += v1[i]*v2[i]; fld QWORD PTR [eax+edx] inc ecx fmul QWORD PTR [eax] add eax, 8 faddp ST(1), ST(0) cmp ecx, edi jl SHORT $LC3@SumProduct@2 $LN9@SumProduct@2: ; 21 : return sum; ; 22 : } pop edi pop esi pop ebx mov esp, ebp pop ebp ret 0 ?SumProduct@@YANPBN0HH@Z ENDP ; SumProduct _TEXT ENDS PUBLIC ?SumProduct@@YANPBN0H@Z ; SumProduct ; Function compile flags: /Ogtp ; COMDAT ?SumProduct@@YANPBN0H@Z _TEXT SEGMENT _v1$ = 8 ; size = 4 _v2$ = 12 ; size = 4 ?SumProduct@@YANPBN0H@Z PROC ; SumProduct, COMDAT ; _n$ = eax ; 5 : { push ebp mov ebp, esp mov edx, DWORD PTR _v2$[ebp] ; 6 : double sum=0; fldz push ebx push esi mov esi, eax ; 7 : for(int i=0; xor ebx, ebx push edi mov edi, DWORD PTR _v1$[ebp] ; 8 : i<n; ; 9 : ++i) cmp esi, 4 jl SHORT $LC9@SumProduct@3 ; 6 : double sum=0; lea eax, DWORD PTR [edx+8] lea ecx, DWORD PTR [edi+24] ; 10 : sum += v1[i]*v2[i]; sub edi, edx lea edx, DWORD PTR [esi-4] shr edx, 2 inc edx lea ebx, DWORD PTR [edx*4] $LN10@SumProduct@3: fld QWORD PTR [eax-8] add eax, 32 ; 00000020H fmul QWORD PTR [ecx-24] add ecx, 32 ; 00000020H dec edx faddp ST(1), ST(0) fld QWORD PTR [edi+eax-32] fmul QWORD PTR [eax-32] faddp ST(1), ST(0) fld QWORD PTR [eax-24] fmul QWORD PTR [ecx-40] faddp ST(1), ST(0) fld QWORD PTR [eax-16] fmul QWORD PTR [ecx-32] faddp ST(1), ST(0) jne SHORT $LN10@SumProduct@3 ; 6 : double sum=0; mov edx, DWORD PTR _v2$[ebp] mov edi, DWORD PTR _v1$[ebp] $LC9@SumProduct@3: ; 8 : i<n; ; 9 : ++i) cmp ebx, esi jge SHORT $LN8@SumProduct@3 sub edi, edx lea eax, DWORD PTR [edx+ebx*8] sub esi, ebx $LC3@SumProduct@3: ; 10 : sum += v1[i]*v2[i]; fld QWORD PTR [eax+edi] add eax, 8 dec esi fmul QWORD PTR [eax-8] faddp ST(1), ST(0) jne SHORT $LC3@SumProduct@3 $LN8@SumProduct@3: ; 11 : return sum; ; 12 : } pop edi pop esi pop ebx pop ebp ret 0 ?SumProduct@@YANPBN0H@Z ENDP ; SumProduct _TEXT ENDS END

CPU之间的一个重要区别是管道优化

CPU可以并行执行多条指令，直到到达条件分支。从这一点开始，CPU不再等待所有指令执行，而是可以并行继续分支，直到条件可用并准备好进行评估。如果假设是正确的，那么我们就有了收益。否则CPU将与另一个分支一起使用。

因此，CPU的棘手部分是找到最佳假设并尽可能并行地执行尽可能多的指令。

为什么添加额外的检查循环会在某些机器上产生很大的差异，而在其他机器上的差异很小？

错误代码枚举的C命名建议

什么是C / C ++中最好的计算机视觉库？

检测mp4文件

为什么“i”变量在我的程序中增加两倍？

比较运算符在Python与C / C ++中的优先级

函数中* p和（* p）之间有什么区别？

为什么malloc（0）返回有效的内存地址？什么用途？

避免fork（）/ SIGCHLD竞争条件

创建数组的动态声明

平均3个整数

为什么添加额外的检查循环会在某些机器上产生很大的差异，而在其他机器上的差异很小？

错误代码枚举的C命名建议

什么是C / C ++中最好的计算机视觉库？

检测mp4文件

为什么“i”变量在我的程序中增加两倍？

比较运算符在Python与C / C ++中的优先级

函数中* p和（* p）之间有什么区别？

为什么malloc（0）返回有效的内存地址？ 什么用途？

避免fork（）/ SIGCHLD竞争条件

创建数组的动态声明

平均3个整数

为什么malloc（0）返回有效的内存地址？什么用途？