为什么添加额外的检查循环会在某些机器上产生很大的差异,而在其他机器上的差异很小?

我一直在做一些测试,看看在循环中检查的附加边界差异有多大。 通过考虑访问数组时由C#,Java等语言插入的隐式边界检查的成本来提示这一点。

更新:我在几台额外的计算机上尝试了相同的可执行程序,这会对正在发生的事情提供更多的启示。 我首先列出了原始计算机,然后是我的现代笔记本电脑。 在我的现代笔记本电脑上,在循环中添加额外的检查只会增加1到4%的时间,相比之下原始硬件的3到30%。

Processor x86 Family 6 Model 30 Stepping 5 GenuineIntel ~2793 Mhz Ratio 2 checks : 1 check = 1.0310 Ratio 3 checks : 1 check = 1.2769 Processor Intel(R) Core(TM) i7-3610QM CPU @ 2.30GHz, 2301 Mhz, 4 Core(s), 8 Logical Processor(s) Ratio 2 checks : 1 check = 1.0090 Ratio 3 checks : 1 check = 1.0393 Processor Intel(R) Core(TM) i5-2500 CPU @ 3.30GHz, 4 Cores(s) Ratio 2 checks : 1 check = 1.0035 Ratio 3 checks : 1 check = 1.0639 Processor Intel(R) Core(TM)2 Duo CPU T9300 @ 2.50GHz, 2501 Mhz, 2 Core(s), 2 Logical Processor(s) Ratio 2 checks : 1 check = 1.1195 Ratio 3 checks : 1 check = 1.3597 Processor x86 Family 15 Model 43 Stepping 1 AuthenticAMD ~2010 Mhz Ratio 2 checks : 1 check = 1.0776 Ratio 3 checks : 1 check = 1.1451 

在下面的测试程序中,第一个函数只检查一个边界,第二个函数检查两个,第三个检查三个(在调用代码中, n1=n2=n3 )。 我发现两个检查比例:一个是大约1.03, 三个检查的比率:一个是大约1.3。 我更惊讶的是,添加一张支票对性能产生了很大的影响。 我得到了一个有趣的答案,关于现代处理器的边界检查成本低于原始问题,这可能会对这里观察到的差异有所了解。

请注意,在未启用整个程序优化的情况下编译程序非常重要; 否则编译器可以简单地删除附加边界检查。

 // dotprod.cpp #include "dotprod.h" double SumProduct(const double* v1, const double* v2, int n) { double sum=0; for(int i=0; i<n; ++i) sum += v1[i]*v2[i]; return sum; } double SumProduct(const double* v1, const double* v2, int n1, int n2) { double sum=0; for(int i=0; i<n1 && i <n2; ++i) sum += v1[i]*v2[i]; return sum; } double SumProduct(const double* v1, const double* v2, int n1, int n2, int n3) { double sum=0; for(int i=0; i<n1 && i <n2 && i <n3; ++i) sum += v1[i]*v2[i]; return sum; } 

此代码最初是使用Visual Studio 2010,Release,Win32构建的(我添加了’C’标记,因为速度差异背后的原因可能不是C ++特定的,可能不是Windows特定的)。 有谁能解释一下?

下面的其余代码,供参考。 这里面有一些C ++特有的东西。

头文件

 // dotprod.h double SumProduct(const double*, const double*, int n); double SumProduct(const double*, const double*, int n1, int n2); double SumProduct(const double*, const double*, int n1, int n2, int n3); 

测试线束

 // main.cpp #include  #include  #include  #include  #include  #include "../dotprod/dotprod.h" // separate lib typedef __int64 timecount_t; inline timecount_t GetTimeCount() { LARGE_INTEGER li; if (!QueryPerformanceCounter(&li)) { exit(1); } return li.QuadPart; } int main() { typedef std::vector dvec; const int N = 100 * 1000; // Initialize dvec v1(N); dvec v2(N); dvec dp1(N); dvec dp2(N); dvec dp3(N); for(int i=0; i<N; ++i) { v1[i] = i; v2[i] = log(static_cast(i+1)); } const timecount_t t0 = GetTimeCount(); // Check cost with one bound for(int n=0; n<N; ++n) { dp1[n] = SumProduct(&(v1[0]),&(v2[0]),n); } const timecount_t t1 = GetTimeCount(); // Check cost with two bounds for(int n=0; n<N; ++n) { dp2[n] = SumProduct(&(v1[0]),&(v2[0]),n,n); } const timecount_t t2 = GetTimeCount(); // Check cost with three bounds for(int n=0; n<N; ++n) { dp3[n] = SumProduct(&(v1[0]),&(v2[0]),n,n,n); } const timecount_t t3 = GetTimeCount(); // Check results const double sumSumProducts1 = std::accumulate(dp1.begin(), dp1.end(), 0.0); const double sumSumProducts2 = std::accumulate(dp2.begin(), dp2.end(), 0.0); const double sumSumProducts3 = std::accumulate(dp3.begin(), dp3.end(), 0.0); printf("Sums of dot products: %.1f, %.1f, %.1f\n", sumSumProducts1, sumSumProducts2, sumSumProducts3); // Output timings const timecount_t elapsed1 = t1-t0; const timecount_t elapsed2 = t2-t1; const timecount_t elapsed3 = t3-t2; printf("Elapsed: %.0f, %.0f, %.0f\n", static_cast(elapsed1), static_cast(elapsed2), static_cast(elapsed3)); const double ratio2to1 = elapsed2 / static_cast(elapsed1); const double ratio3to1 = elapsed3 / static_cast(elapsed1); printf("Ratio 2:1=%.2f\n", ratio2to1); printf("Ratio 3:1=%.2f\n", ratio3to1); return 0; } 

为了产生汇编,我在这个答案中接受了建议(案例2,关闭整个程序优化),生成以下asm文件。

 ; Listing generated by Microsoft (R) Optimizing Compiler Version 16.00.40219.01 TITLE C:\dev\TestSpeed\dotprod\dotprod.cpp .686P .XMM include listing.inc .model flat INCLUDELIB OLDNAMES PUBLIC __real@0000000000000000 PUBLIC ?SumProduct@@YANPBN0HHH@Z ; SumProduct EXTRN __fltused:DWORD ; COMDAT __real@0000000000000000 ; File c:\dev\testspeed\dotprod\dotprod.cpp CONST SEGMENT __real@0000000000000000 DQ 00000000000000000r ; 0 ; Function compile flags: /Ogtp CONST ENDS ; COMDAT ?SumProduct@@YANPBN0HHH@Z _TEXT SEGMENT tv491 = -4 ; size = 4 _v1$ = 8 ; size = 4 _v2$ = 12 ; size = 4 _n1$ = 16 ; size = 4 _n2$ = 20 ; size = 4 _n3$ = 24 ; size = 4 ?SumProduct@@YANPBN0HHH@Z PROC ; SumProduct, COMDAT ; 25 : { push ebp mov ebp, esp push ecx ; 26 : double sum=0; fldz push ebx mov ebx, DWORD PTR _v2$[ebp] push esi push edi mov edi, DWORD PTR _n1$[ebp] ; 27 : for(int i=0; xor ecx, ecx ; 28 : i<n1 && i <n2 && i <n3; ; 29 : ++i) cmp edi, 4 jl $LC8@SumProduct ; 26 : double sum=0; mov edi, DWORD PTR _v1$[ebp] lea esi, DWORD PTR [edi+24] ; 30 : sum += v1[i]*v2[i]; sub edi, ebx lea edx, DWORD PTR [ecx+2] lea eax, DWORD PTR [ebx+8] mov DWORD PTR tv491[ebp], edi $LN15@SumProduct: ; 28 : i<n1 && i <n2 && i <n3; ; 29 : ++i) mov ebx, DWORD PTR _n2$[ebp] cmp ecx, ebx jge $LN9@SumProduct cmp ecx, DWORD PTR _n3$[ebp] jge $LN9@SumProduct ; 30 : sum += v1[i]*v2[i]; fld QWORD PTR [eax-8] lea edi, DWORD PTR [edx-1] fmul QWORD PTR [esi-24] faddp ST(1), ST(0) cmp edi, ebx jge SHORT $LN9@SumProduct ; 28 : i<n1 && i <n2 && i <n3; ; 29 : ++i) cmp edi, DWORD PTR _n3$[ebp] jge SHORT $LN9@SumProduct ; 30 : sum += v1[i]*v2[i]; mov edi, DWORD PTR tv491[ebp] fld QWORD PTR [edi+eax] fmul QWORD PTR [eax] faddp ST(1), ST(0) cmp edx, ebx jge SHORT $LN9@SumProduct ; 28 : i<n1 && i <n2 && i <n3; ; 29 : ++i) cmp edx, DWORD PTR _n3$[ebp] jge SHORT $LN9@SumProduct ; 30 : sum += v1[i]*v2[i]; fld QWORD PTR [eax+8] lea edi, DWORD PTR [edx+1] fmul QWORD PTR [esi-8] faddp ST(1), ST(0) cmp edi, ebx jge SHORT $LN9@SumProduct ; 28 : i<n1 && i <n2 && i <n3; ; 29 : ++i) cmp edi, DWORD PTR _n3$[ebp] jge SHORT $LN9@SumProduct ; 30 : sum += v1[i]*v2[i]; fld QWORD PTR [eax+16] mov edi, DWORD PTR _n1$[ebp] fmul QWORD PTR [esi] add ecx, 4 lea ebx, DWORD PTR [edi-3] add eax, 32 ; 00000020H add esi, 32 ; 00000020H faddp ST(1), ST(0) add edx, 4 cmp ecx, ebx jl SHORT $LN15@SumProduct mov ebx, DWORD PTR _v2$[ebp] $LC8@SumProduct: ; 28 : i<n1 && i <n2 && i <n3; ; 29 : ++i) cmp ecx, edi jge SHORT $LN9@SumProduct mov edx, DWORD PTR _v1$[ebp] lea eax, DWORD PTR [ebx+ecx*8] sub edx, ebx $LC3@SumProduct: cmp ecx, DWORD PTR _n2$[ebp] jge SHORT $LN9@SumProduct cmp ecx, DWORD PTR _n3$[ebp] jge SHORT $LN9@SumProduct ; 30 : sum += v1[i]*v2[i]; fld QWORD PTR [eax+edx] inc ecx fmul QWORD PTR [eax] add eax, 8 faddp ST(1), ST(0) cmp ecx, edi jl SHORT $LC3@SumProduct $LN9@SumProduct: ; 31 : return sum; ; 32 : } pop edi pop esi pop ebx mov esp, ebp pop ebp ret 0 ?SumProduct@@YANPBN0HHH@Z ENDP ; SumProduct _TEXT ENDS PUBLIC ?SumProduct@@YANPBN0HH@Z ; SumProduct ; Function compile flags: /Ogtp ; COMDAT ?SumProduct@@YANPBN0HH@Z _TEXT SEGMENT tv448 = -4 ; size = 4 _v1$ = 8 ; size = 4 _v2$ = 12 ; size = 4 _n1$ = 16 ; size = 4 _n2$ = 20 ; size = 4 ?SumProduct@@YANPBN0HH@Z PROC ; SumProduct, COMDAT ; 15 : { push ebp mov ebp, esp push ecx ; 16 : double sum=0; fldz push ebx mov ebx, DWORD PTR _v2$[ebp] push esi push edi mov edi, DWORD PTR _n1$[ebp] ; 17 : for(int i=0; xor ecx, ecx ; 18 : i<n1 && i <n2; ; 19 : ++i) cmp edi, 4 jl SHORT $LC8@SumProduct@2 ; 16 : double sum=0; mov edi, DWORD PTR _v1$[ebp] lea edx, DWORD PTR [edi+24] ; 20 : sum += v1[i]*v2[i]; sub edi, ebx lea esi, DWORD PTR [ecx+2] lea eax, DWORD PTR [ebx+8] mov DWORD PTR tv448[ebp], edi $LN19@SumProduct@2: mov edi, DWORD PTR _n2$[ebp] cmp ecx, edi jge SHORT $LN9@SumProduct@2 fld QWORD PTR [eax-8] lea ebx, DWORD PTR [esi-1] fmul QWORD PTR [edx-24] faddp ST(1), ST(0) cmp ebx, edi jge SHORT $LN9@SumProduct@2 mov ebx, DWORD PTR tv448[ebp] fld QWORD PTR [ebx+eax] fmul QWORD PTR [eax] faddp ST(1), ST(0) cmp esi, edi jge SHORT $LN9@SumProduct@2 fld QWORD PTR [eax+8] lea ebx, DWORD PTR [esi+1] fmul QWORD PTR [edx-8] faddp ST(1), ST(0) cmp ebx, edi jge SHORT $LN9@SumProduct@2 fld QWORD PTR [eax+16] mov edi, DWORD PTR _n1$[ebp] fmul QWORD PTR [edx] add ecx, 4 lea ebx, DWORD PTR [edi-3] add eax, 32 ; 00000020H add edx, 32 ; 00000020H faddp ST(1), ST(0) add esi, 4 cmp ecx, ebx jl SHORT $LN19@SumProduct@2 mov ebx, DWORD PTR _v2$[ebp] $LC8@SumProduct@2: ; 18 : i<n1 && i <n2; ; 19 : ++i) cmp ecx, edi jge SHORT $LN9@SumProduct@2 mov edx, DWORD PTR _v1$[ebp] lea eax, DWORD PTR [ebx+ecx*8] sub edx, ebx $LC3@SumProduct@2: cmp ecx, DWORD PTR _n2$[ebp] jge SHORT $LN9@SumProduct@2 ; 20 : sum += v1[i]*v2[i]; fld QWORD PTR [eax+edx] inc ecx fmul QWORD PTR [eax] add eax, 8 faddp ST(1), ST(0) cmp ecx, edi jl SHORT $LC3@SumProduct@2 $LN9@SumProduct@2: ; 21 : return sum; ; 22 : } pop edi pop esi pop ebx mov esp, ebp pop ebp ret 0 ?SumProduct@@YANPBN0HH@Z ENDP ; SumProduct _TEXT ENDS PUBLIC ?SumProduct@@YANPBN0H@Z ; SumProduct ; Function compile flags: /Ogtp ; COMDAT ?SumProduct@@YANPBN0H@Z _TEXT SEGMENT _v1$ = 8 ; size = 4 _v2$ = 12 ; size = 4 ?SumProduct@@YANPBN0H@Z PROC ; SumProduct, COMDAT ; _n$ = eax ; 5 : { push ebp mov ebp, esp mov edx, DWORD PTR _v2$[ebp] ; 6 : double sum=0; fldz push ebx push esi mov esi, eax ; 7 : for(int i=0; xor ebx, ebx push edi mov edi, DWORD PTR _v1$[ebp] ; 8 : i<n; ; 9 : ++i) cmp esi, 4 jl SHORT $LC9@SumProduct@3 ; 6 : double sum=0; lea eax, DWORD PTR [edx+8] lea ecx, DWORD PTR [edi+24] ; 10 : sum += v1[i]*v2[i]; sub edi, edx lea edx, DWORD PTR [esi-4] shr edx, 2 inc edx lea ebx, DWORD PTR [edx*4] $LN10@SumProduct@3: fld QWORD PTR [eax-8] add eax, 32 ; 00000020H fmul QWORD PTR [ecx-24] add ecx, 32 ; 00000020H dec edx faddp ST(1), ST(0) fld QWORD PTR [edi+eax-32] fmul QWORD PTR [eax-32] faddp ST(1), ST(0) fld QWORD PTR [eax-24] fmul QWORD PTR [ecx-40] faddp ST(1), ST(0) fld QWORD PTR [eax-16] fmul QWORD PTR [ecx-32] faddp ST(1), ST(0) jne SHORT $LN10@SumProduct@3 ; 6 : double sum=0; mov edx, DWORD PTR _v2$[ebp] mov edi, DWORD PTR _v1$[ebp] $LC9@SumProduct@3: ; 8 : i<n; ; 9 : ++i) cmp ebx, esi jge SHORT $LN8@SumProduct@3 sub edi, edx lea eax, DWORD PTR [edx+ebx*8] sub esi, ebx $LC3@SumProduct@3: ; 10 : sum += v1[i]*v2[i]; fld QWORD PTR [eax+edi] add eax, 8 dec esi fmul QWORD PTR [eax-8] faddp ST(1), ST(0) jne SHORT $LC3@SumProduct@3 $LN8@SumProduct@3: ; 11 : return sum; ; 12 : } pop edi pop esi pop ebx pop ebp ret 0 ?SumProduct@@YANPBN0H@Z ENDP ; SumProduct _TEXT ENDS END 

CPU之间的一个重要区别是管道优化

CPU可以并行执行多条指令,直到到达条件分支。 从这一点开始,CPU不再等待所有指令执行,而是可以并行继续分支,直到条件可用并准备好进行评估。 如果假设是正确的,那么我们就有了收益。 否则CPU将与另一个分支一起使用。

因此,CPU的棘手部分是找到最佳假设并尽可能并行地执行尽可能多的指令。