重叠数组的总和，自动矢量化和限制

Arstechnia最近有一篇文章为什么一些编程语言比其他语言更快。它比较了Fortran和C，并提到了求和数组。在Fortran中，假设数组不重叠，因此可以进一步优化。在C / C ++中，指向相同类型的指针可能会重叠，因此通常不能使用此优化。但是，在C / C ++中，可以使用restrict或__restrict关键字告诉编译器不要假设指针重叠。所以我开始研究自动矢量化。

以下代码在GCC和MSVC中进行矢量化

 void dot_int(int *a, int *b, int *c, int n) { for(int i=0; i<n; i++) { c[i] = a[i] + b[i]; } }

我使用和不使用重叠数组测试了它，它得到了正确的结果。但是，我使用SSE手动向量化循环的方式不能处理重叠数组。

 int i=0; for(; i<n-3; i+=4) { __m128i a4 = _mm_loadu_si128((__m128i*)&a[i]); __m128i b4 = _mm_loadu_si128((__m128i*)&b[i]); __m128i c4 = _mm_add_epi32(a4,b4); _mm_storeu_si128((__m128i*)c, c4); } for(; i<n; i++) { c[i] = a[i] + b[i]; }

接下来我尝试使用__restrict 。我假设由于编译器可以假设数组不重叠，它不会处理重叠数组，但GCC和MSVC仍然可以获得重叠数组的正确结果，即使使用__restrict 。

 void dot_int_restrict(int * __restrict a, int * __restrict b, int * __restrict c, int n) { for(int i=0; i<n; i++) { c[i] = a[i] + b[i]; } }

为什么带有和不带__restrict的自动向量化代码会为重叠数组获得正确的结果？ 。

这是我用来测试这个的完整代码：

 #include  #include  void dot_int(int *a, int *b, int *c, int n) { for(int i=0; i<n; i++) { c[i] = a[i] + b[i]; } for(int i=0; i<8; i++) printf("%d ", c[i]); printf("\n"); } void dot_int_restrict(int * __restrict a, int * __restrict b, int * __restrict c, int n) { for(int i=0; i<n; i++) { c[i] = a[i] + b[i]; } for(int i=0; i<8; i++) printf("%d ", c[i]); printf("\n"); } void dot_int_SSE(int *a, int *b, int *c, int n) { int i=0; for(; i<n-3; i+=4) { __m128i a4 = _mm_loadu_si128((__m128i*)&a[i]); __m128i b4 = _mm_loadu_si128((__m128i*)&b[i]); __m128i c4 = _mm_add_epi32(a4,b4); _mm_storeu_si128((__m128i*)c, c4); } for(; i<n; i++) { c[i] = a[i] + b[i]; } for(int i=0; i<8; i++) printf("%d ", c[i]); printf("\n"); } int main() { const int n = 100; int a[] = {1,1,1,1,1,1,1,1}; int b1[] = {1,1,1,1,1,1,1,1,1}; int b2[] = {1,1,1,1,1,1,1,1,1}; int b3[] = {1,1,1,1,1,1,1,1,1}; int c[8]; int *c1 = &b1[1]; int *c2 = &b2[1]; int *c3 = &b3[1]; dot_int(a,b1,c, 8); dot_int_SSE(a,b1,c,8); dot_int(a,b1,c1, 8); dot_int_restrict(a,b2,c2,8); dot_int_SSE(a,b3,c3,8); }

输出（来自MSVC）

 2 2 2 2 2 2 2 2 //no overlap default 2 2 2 2 2 2 2 2 //no overlap with manual SSE vector code 2 3 4 5 6 7 8 9 //overlap default 2 3 4 5 6 7 8 9 //overlap with restrict 3 2 2 2 1 1 1 1 //manual SSE vector code

编辑：

这是另一个插入版本，它产生更简单的代码

 void dot_int(int * __restrict a, int * __restrict b, int * __restrict c, int n) { a = (int*)__builtin_assume_aligned (a, 16); b = (int*)__builtin_assume_aligned (b, 16); c = (int*)__builtin_assume_aligned (c, 16); for(int i=0; i<n; i++) { c[i] = a[i] + b[i]; } }

我不知道问题是什么。在Linux / 64位，GCC 4.6，-O3，-mtune = native，-msse4.1（即一个非常旧的编译器/系统）上进行测试，此代码

 void dot_int(int *a, int *b, int *c, int n) { for(int i=0; i


 编译到这个内循环： 
 .L4: movdqu (%rdi,%rax), %xmm1 addl $1, %r8d movdqu (%rsi,%rax), %xmm0 paddd %xmm1, %xmm0 movdqu %xmm0, (%rdx,%rax) addq $16, %rax cmpl %r8d, %r10d ja .L4 cmpl %r9d, %ecx je .L1 
 虽然这个代码 
 void dot_int_restrict(int * __restrict a, int * __restrict b, int * __restrict c, int n) { for(int i=0; i 

 汇编到这个： 
 .L15: movdqu (%rbx,%rax), %xmm0 addl $1, %r8d paddd 0(%rbp,%rax), %xmm0 movdqu %xmm0, (%r11,%rax) addq $16, %rax cmpl %r10d, %r8d jb .L15 addl %r12d, %r9d cmpl %r12d, %r13d je .L10 
 你可以清楚地看到减少一个负载。 我猜它的估计是在执行总和之前没有必要明确地加载内存，因为结果不会覆盖任何东西。 
 还有更多优化的空间 -  GCC不知道参数是128位对齐的，因此它必须生成一个巨大的前导码来检查没有对齐问题（YMMV），以及一个处理额外未对齐的后缀部分（或不到128位）。 这实际上发生在上面的两个版本。 这是为dot_int生成的完整代码： 
 dot_int: .LFB626: .cfi_startproc testl %ecx, %ecx pushq %rbx .cfi_def_cfa_offset 16 .cfi_offset 3, -16 jle .L1 leaq 16(%rdx), %r11 movl %ecx, %r10d shrl $2, %r10d leal 0(,%r10,4), %r9d testl %r9d, %r9d je .L6 leaq 16(%rdi), %rax cmpl $6, %ecx seta %r8b cmpq %rax, %rdx seta %al cmpq %r11, %rdi seta %bl orl %ebx, %eax andl %eax, %r8d leaq 16(%rsi), %rax cmpq %rax, %rdx seta %al cmpq %r11, %rsi seta %r11b orl %r11d, %eax testb %al, %r8b je .L6 xorl %eax, %eax xorl %r8d, %r8d .p2align 4,,10 .p2align 3 .L4: movdqu (%rdi,%rax), %xmm1 addl $1, %r8d movdqu (%rsi,%rax), %xmm0 paddd %xmm1, %xmm0 movdqu %xmm0, (%rdx,%rax) addq $16, %rax cmpl %r8d, %r10d ja .L4 cmpl %r9d, %ecx je .L1 .L3: movslq %r9d, %r8 xorl %eax, %eax salq $2, %r8 addq %r8, %rdx addq %r8, %rdi addq %r8, %rsi .p2align 4,,10 .p2align 3 .L5: movl (%rdi,%rax,4), %r8d addl (%rsi,%rax,4), %r8d movl %r8d, (%rdx,%rax,4) addq $1, %rax leal (%r9,%rax), %r8d cmpl %r8d, %ecx jg .L5 .L1: popq %rbx .cfi_remember_state .cfi_def_cfa_offset 8 ret .L6: .cfi_restore_state xorl %r9d, %r9d jmp .L3 .cfi_endproc 
 现在在你的情况下，int有效地不对齐（因为它们在堆栈中），但是如果你可以使它们对齐并告诉GCC，那么你可以改进代码生成： 
 typedef int intvec __attribute__((vector_size(16))); void dot_int_restrict_alig(intvec * restrict a, intvec * restrict b, intvec * restrict c, unsigned int n) { for(unsigned int i=0; i 

 这会生成此代码，没有前导码： 
 dot_int_restrict_alig: .LFB628: .cfi_startproc testl %ecx, %ecx je .L23 subl $1, %ecx xorl %eax, %eax addq $1, %rcx salq $4, %rcx .p2align 4,,10 .p2align 3 .L25: movdqa (%rdi,%rax), %xmm0 paddd (%rsi,%rax), %xmm0 movdqa %xmm0, (%rdx,%rax) addq $16, %rax cmpq %rcx, %rax jne .L25 .L23: rep ret .cfi_endproc 
 注意使用对齐的128位加载指令（ movdqa ， a as aligned，vs movdqu ， movdqu ）。



		      	 如果对重叠数组使用“restrict”，则会得到未定义的行为。 这就是你在“重叠限制”情况下得到的结果。 未定义的行为意味着任何事情都会发生。 它确实如此。 巧合的是，行为与没有“限制”的行为相同。 完全正确。 它完全符合“任何可能发生的事情”的定义。 没什么可抱怨的。 



		      	 我想我明白现在发生了什么。 事实certificate，MSVC和GCC使用__restrict给出了不同的结果。  MSVC得到重叠的正确答案，GCC没有。 我认为可以得出结论，MSVC忽略了__restrict关键字，而GCC正在使用它来进一步优化。 
 海湾合作委员会的产出 
 2 2 2 2 2 2 2 2 //no overlap default 2 2 2 2 2 2 2 2 //no overlap with manual SSE vector code 2 3 4 5 6 7 8 9 //overlap without __restrict 2 2 2 2 3 2 2 2 //overlap with __restrict 3 2 2 2 1 1 1 1 //manual SSE vector code 
 我们可以生成一个纯矢量化函数，它提供与C一样多的assembly线（所有代码都是用GCC 4.9.0中的-O3生成的）： 
 void dot_int(int * __ restrict a, int * __restrict b, int * __restrict c) { a = (int*)__builtin_assume_aligned (a, 16); b = (int*)__builtin_assume_aligned (b, 16); c = (int*)__builtin_assume_aligned (c, 16); for(int i=0; i<1024; i++) { c[i] = a[i] + b[i]; } } 
 产生 
 dot_int(int*, int*, int*): xorl %eax, %eax .L2: movdqa (%rdi,%rax), %xmm0 paddd (%rsi,%rax), %xmm0 movaps %xmm0, (%rdx,%rax) addq $16, %rax cmpq $4096, %rax jne .L2 rep ret 
 但是，如果我们删除了允许与c重叠的__restrict on a我已经通过查看程序集来确定了 
 void dot_int(int * a, int * __restrict b, int * __restrict c) { a = (int*)__builtin_assume_aligned (a, 16); b = (int*)__builtin_assume_aligned (b, 16); c = (int*)__builtin_assume_aligned (c, 16); for(int i=0; i<1024; i++) { c[i] = a[i] + b[i]; } } 
 是完全相同的 
 __attribute__((optimize("no-tree-vectorize"))) inline void dot_SSE(int * __restrict a, int * __restrict b, int * __restrict c) { for(int i=0; i<1024; i+=4) { __m128i a4 = _mm_load_si128((__m128i*)&a[i]); __m128i b4 = _mm_load_si128((__m128i*)&a[i]); __m128i c4 = _mm_add_epi32(a4,b4); _mm_store_si128((__m128i*)&c[i],c4); } } __attribute__((optimize("no-tree-vectorize"))) void dot_int(int * __restrict a, int * __restrict b, int * __restrict c) { a = (int*)__builtin_assume_aligned (a, 16); b = (int*)__builtin_assume_aligned (b, 16); c = (int*)__builtin_assume_aligned (c, 16); int pass = 1; if((c+4) 
  换句话说，如果a和c指针彼此在16个字节之内（| ac | <4），则代码分支到非矢量化形式。 这证实了我的猜测，自动矢量化代码包括矢量化和非矢量化版本以处理重叠。 
 在重叠arrays上，这得到了正确的结果：2 3 4 5 6 7 8 9



  使用void *键入punning而不破坏C99中的严格别名规则
  将＆符号应用于C / C ++或任何其他此类语言的变量或数据类型时返回的是什么类型的地址？
	从C中的文件打印标记化数据
从C ++ / CLI Visual Studio 2010调用使用VS 2005编译的本机C  – 无法打开.lib文件…
如何使用Turbo C ++编译器在Notepad ++中编译和调试C ++
遵循C中的符号链接
使用零流优化更新MD5 / SHA1
Scanf解析字符串输入到字符数组
setrlimit不可靠？
基于第一列排序二维数组
是否可以简单地序列化C ++对象

重叠数组的总和，自动矢量化和限制

为什么__sync_add_and_fetch适用于32位系统上的64位变量？

C指针指针和seg错误

C样式结构声明

在编译时查找数组元素位置

使用指针实现strcat

将C / C ++unit testing与maven集成

便携式捕获信号并向用户报告问题的方法

WAV文件分析C（libsndfile，fftw3）

练习1-18在K＆R C编程语言中

C / C ++优化数据结构，数组数组或只是数组