C中最快的去交织操作？

我有一个指向mixed字节数组的指针，它包含两个不同数组array1和array2的交错字节。 mixed看起来像这样：

 a1b2c3d4...

我需要做的是解交织字节，所以我得到array1 = abcd...和array2 = 1234... 我知道提前mixed的长度，并且array1和array2的长度是等价的，都等于mixed / 2 。

这是我当前的实现（已分配array1和array2 ）：

 int i, j; int mixedLength_2 = mixedLength / 2; for (i = 0, j = 0; i < mixedLength_2; i++, j += 2) { array1[i] = mixed[j]; array2[i] = mixed[j+1]; }

这避免了任何昂贵的乘法或除法运算，但仍然运行得不够快。我希望有一些像memcpy这样的东西，它可以使用一个可以使用低级块复制操作的索引器来加速这个过程。是否有比我现有的更快的实施？

编辑

目标平台是针对iOS和Mac的Objective-C。对于iOS设备而言，快速操作更为重要，因此针对iOS的解决方案将比没有更好。

更新

感谢大家的回应，尤其是Stephen Canon，Graham Lee和Mecki。这是我的“主”function，使用Stephen的NEON内在函数（如果可用）和Graham的联合游标，Mecki建议的迭代次数减少。

 void interleave(const uint8_t *srcA, const uint8_t *srcB, uint8_t *dstAB, size_t dstABLength) { #if defined __ARM_NEON__ // attempt to use NEON intrinsics // iterate 32-bytes at a time div_t dstABLength_32 = div(dstABLength, 32); if (dstABLength_32.rem == 0) { while (dstABLength_32.quot --> 0) { const uint8x16_t a = vld1q_u8(srcA); const uint8x16_t b = vld1q_u8(srcB); const uint8x16x2_t ab = { a, b }; vst2q_u8(dstAB, ab); srcA += 16; srcB += 16; dstAB += 32; } return; } // iterate 16-bytes at a time div_t dstABLength_16 = div(dstABLength, 16); if (dstABLength_16.rem == 0) { while (dstABLength_16.quot --> 0) { const uint8x8_t a = vld1_u8(srcA); const uint8x8_t b = vld1_u8(srcB); const uint8x8x2_t ab = { a, b }; vst2_u8(dstAB, ab); srcA += 8; srcB += 8; dstAB += 16; } return; } #endif // if the bytes were not aligned properly // or NEON is unavailable, fall back to // an optimized iteration // iterate 8-bytes at a time div_t dstABLength_8 = div(dstABLength, 8); if (dstABLength_8.rem == 0) { typedef union { uint64_t wide; struct { uint8_t a1; uint8_t b1; uint8_t a2; uint8_t b2; uint8_t a3; uint8_t b3; uint8_t a4; uint8_t b4; } narrow; } ab8x8_t; uint64_t *dstAB64 = (uint64_t *)dstAB; int j = 0; for (int i = 0; i < dstABLength_8.quot; i++) { ab8x8_t cursor; cursor.narrow.a1 = srcA[j ]; cursor.narrow.b1 = srcB[j++]; cursor.narrow.a2 = srcA[j ]; cursor.narrow.b2 = srcB[j++]; cursor.narrow.a3 = srcA[j ]; cursor.narrow.b3 = srcB[j++]; cursor.narrow.a4 = srcA[j ]; cursor.narrow.b4 = srcB[j++]; dstAB64[i] = cursor.wide; } return; } // iterate 4-bytes at a time div_t dstABLength_4 = div(dstABLength, 4); if (dstABLength_4.rem == 0) { typedef union { uint32_t wide; struct { uint8_t a1; uint8_t b1; uint8_t a2; uint8_t b2; } narrow; } ab8x4_t; uint32_t *dstAB32 = (uint32_t *)dstAB; int j = 0; for (int i = 0; i < dstABLength_4.quot; i++) { ab8x4_t cursor; cursor.narrow.a1 = srcA[j ]; cursor.narrow.b1 = srcB[j++]; cursor.narrow.a2 = srcA[j ]; cursor.narrow.b2 = srcB[j++]; dstAB32[i] = cursor.wide; } return; } // iterate 2-bytes at a time div_t dstABLength_2 = div(dstABLength, 2); typedef union { uint16_t wide; struct { uint8_t a; uint8_t b; } narrow; } ab8x2_t; uint16_t *dstAB16 = (uint16_t *)dstAB; for (int i = 0; i  0) { const uint8x16x2_t ab = vld2q_u8(srcAB); vst1q_u8(dstA, ab.val[0]); vst1q_u8(dstB, ab.val[1]); srcAB += 32; dstA += 16; dstB += 16; } return; } // iterate 16-bytes at a time div_t srcABLength_16 = div(srcABLength, 16); if (srcABLength_16.rem == 0) { while (srcABLength_16.quot --> 0) { const uint8x8x2_t ab = vld2_u8(srcAB); vst1_u8(dstA, ab.val[0]); vst1_u8(dstB, ab.val[1]); srcAB += 16; dstA += 8; dstB += 8; } return; } #endif // if the bytes were not aligned properly // or NEON is unavailable, fall back to // an optimized iteration // iterate 8-bytes at a time div_t srcABLength_8 = div(srcABLength, 8); if (srcABLength_8.rem == 0) { typedef union { uint64_t wide; struct { uint8_t a1; uint8_t b1; uint8_t a2; uint8_t b2; uint8_t a3; uint8_t b3; uint8_t a4; uint8_t b4; } narrow; } ab8x8_t; uint64_t *srcAB64 = (uint64_t *)srcAB; int j = 0; for (int i = 0; i < srcABLength_8.quot; i++) { ab8x8_t cursor; cursor.wide = srcAB64[i]; dstA[j ] = cursor.narrow.a1; dstB[j++] = cursor.narrow.b1; dstA[j ] = cursor.narrow.a2; dstB[j++] = cursor.narrow.b2; dstA[j ] = cursor.narrow.a3; dstB[j++] = cursor.narrow.b3; dstA[j ] = cursor.narrow.a4; dstB[j++] = cursor.narrow.b4; } return; } // iterate 4-bytes at a time div_t srcABLength_4 = div(srcABLength, 4); if (srcABLength_4.rem == 0) { typedef union { uint32_t wide; struct { uint8_t a1; uint8_t b1; uint8_t a2; uint8_t b2; } narrow; } ab8x4_t; uint32_t *srcAB32 = (uint32_t *)srcAB; int j = 0; for (int i = 0; i < srcABLength_4.quot; i++) { ab8x4_t cursor; cursor.wide = srcAB32[i]; dstA[j ] = cursor.narrow.a1; dstB[j++] = cursor.narrow.b1; dstA[j ] = cursor.narrow.a2; dstB[j++] = cursor.narrow.b2; } return; } // iterate 2-bytes at a time div_t srcABLength_2 = div(srcABLength, 2); typedef union { uint16_t wide; struct { uint8_t a; uint8_t b; } narrow; } ab8x2_t; uint16_t *srcAB16 = (uint16_t *)srcAB; for (int i = 0; i < srcABLength_2.quot; i++) { ab8x2_t cursor; cursor.wide = srcAB16[i]; dstA[i] = cursor.narrow.a; dstB[i] = cursor.narrow.b; } }

在我的脑海中，我不知道用于解交织2个通道字节数据的库函数。但是，值得向Apple提交错误报告以请求此类function。

与此同时，使用NEON或SSE内在函数对这样的函数进行矢量化非常容易。具体来说，在ARM上，您将需要使用vld1q_u8加载来自每个源数组的向量， vuzpq_u8来对它们进行去交织，并使用vst1q_u8来存储生成的向量; 这是一个粗略的草图，我没有测试过甚至试图建立，但它应该说明一般的想法。更复杂的实现肯定是可能的（特别是，NEON可以在一条指令中加载/存储两个 16B寄存器，编译器可能不会这样做，并且根据缓冲区的长度，一些流水线和/或展开可能是有益的是）：

 #if defined __ARM_NEON__ # include  #endif #include  #include  void deinterleave(uint8_t *mixed, uint8_t *array1, uint8_t *array2, size_t mixedLength) { #if defined __ARM_NEON__ size_t vectors = mixedLength / 32; mixedLength %= 32; while (vectors --> 0) { const uint8x16_t src0 = vld1q_u8(mixed); const uint8x16_t src1 = vld1q_u8(mixed + 16); const uint8x16x2_t dst = vuzpq_u8(src0, src1); vst1q_u8(array1, dst.val[0]); vst1q_u8(array2, dst.val[1]); mixed += 32; array1 += 16; array2 += 16; } #endif for (size_t i=0; i



		      	 我只是轻轻地测试了它，但它似乎至少是你版本的两倍： 
 typedef union { uint16_t wide; struct { uint8_t top; uint8_t bottom; } narrow; } my_union; uint16_t *source = (uint16_t *)mixed; for (int i = 0; i < mixedLength/2; i++) { my_union cursor; cursor.wide = source[i]; array1[i] = cursor.narrow.top; array2[i] = cursor.narrow.bottom; } 
 请注意，我对结构打包并不小心，但在这种情况下，这种架构不是问题。 另请注意，有人可能会因为我选择命名top和bottom抱怨; 我假设你知道你需要哪一半整数。 



		      	 好的，这是你原来的方法： 
 static void simpleDeint ( uint8_t * array1, uint8_t * array2, uint8_t * mixed, int mixedLength ) { int i, j; int mixedLength_2 = mixedLength / 2; for (i = 0, j = 0; i < mixedLength_2; i++, j += 2) { array1[i] = mixed[j]; array2[i] = mixed[j+1]; } } 
 有1000万个条目和-O3 （编译器应优化最大速度），我可以在Mac上每秒运行154次。 
 这是我的第一个建议： 
 static void structDeint ( uint8_t * array1, uint8_t * array2, uint8_t * mixed, int mixedLength ) { int i; int len; uint8_t * array1Ptr = (uint8_t *)array1; uint8_t * array2Ptr = (uint8_t *)array2; struct { uint8_t byte1; uint8_t byte2; } * tb = (void *)mixed; len = mixedLength / 2; for (i = 0; i < len; i++) { *(array1Ptr++) = tb->byte1; *(array2Ptr++) = tb->byte2; tb++; } } 
 与以前相同的计数和优化，我每秒获得193次运行。 
 现在格雷厄姆·李的建议： 
 static void unionDeint ( uint8_t * array1, uint8_t * array2, uint8_t * mixed, int mixedLength ) { union my_union { uint16_t wide; struct { uint8_t top; uint8_t bottom; } narrow; }; uint16_t * source = (uint16_t *)mixed; for (int i = 0; i < mixedLength/2; i++) { union my_union cursor; cursor.wide = source[i]; array1[i] = cursor.narrow.top; array2[i] = cursor.narrow.bottom; } } 
 与之前相同的设置，每秒198次运行（注意：此方法不是字节式安全，结果取决于CPU的endianess。在您的情况下，由于ARM是小端，因此可能会交换array1和array2，因此您必须在代码中交换它们）。 
 到目前为止，这是我最好的一个： 
 static void uint32Deint ( uint8_t * array1, uint8_t * array2, uint8_t * mixed, int mixedLength ) { int i; int count; uint32_t * fourBytes = (void *)mixed; uint8_t * array1Ptr = (uint8_t *)array1; uint8_t * array2Ptr = (uint8_t *)array2; count = mixedLength / 4; for (i = 0; i < count; i++) { uint32_t temp = *(fourBytes++); #if __LITTLE_ENDIAN__ *(array1Ptr++) = (uint8_t)(temp & 0xFF); temp >>= 8; *(array2Ptr++) = (uint8_t)(temp & 0xFF); temp >>= 8; *(array1Ptr++) = (uint8_t)(temp & 0xFF); temp >>= 8; *(array2Ptr++) = tb->byte2; #else *(array1Ptr++) = (uint8_t)(temp >> 24); *(array2Ptr++) = (uint8_t)((temp >> 16) & 0xFF); *(array1Ptr++) = (uint8_t)((temp >> 8) & 0xFF); *(array2Ptr++) = (uint8_t)(temp & 0xFF); #endif } // Either it is a multiple of 4 or a multiple of 2. // If it is a multiple of 2, 2 bytes are left over. if (count * 4 != mixedLength) { *(array1Ptr) = mixed[mixedLength - 2]; *(array2Ptr) = mixed[mixedLength - 1]; } } 
 与上面相同的设置，每秒219次，除非我犯了错误，否则应该使用任何一个endianess。 



		      	 我推荐格雷厄姆的解决方案，但如果这对速度非常重要并且你愿意去assembly，你可以更快。 
 这个想法是这样的： 


 从mixed读取整个32位整数。 你会得到’a1b2’。 


 将低16位旋转8位以获得’1ab2’（我们使用小端，因为这是ARM中的默认值，因此Apple A＃，因此前两个字节是较低的字节）。 


 将整个32位寄存器向右旋转（我认为是正确的……）8位以获得’21ab’。 


 将低16位旋转8位以获得’12ab’ 


 将低8位写入array2 。 


 将整个32位寄存器旋转16位。 


 将低8位写入array1 


 将array1推进16位， array2推进16位， mixed 32位。 


 重复。 


 我们交换了2个内存读取（假设我们使用Graham的版本或等价物）和4个内存，一个内存读取，两个内存写入和4个寄存器操作。 虽然操作次数从6次增加到7次，但是寄存器操作比内存操作更快，因此它的效率更高。 此外，由于我们一次从mixed 32位读取而不是16位，因此我们将迭代管理减少了一半。 
  PS：从理论上讲，这也适用于64位架构，但对’a1b2c3d4’进行所有这些旋转会让你感到疯狂。 



		      	 对于x86 SSE， pack和punpck指令是您所需要的。 使用AVX的示例，以方便非破坏性3操作数指令。  （不使用AVX2 256b宽指令，因为256b打包/取消指令在低和高128b通道中执行两次128b解压缩，因此您需要随机播放以获得正确的最终顺序。） 
 以下内在版本的工作方式相同。 只需编写快速答案，Asm指令的类型就会缩短。 
  交错 ： abcd和1234 – > a1b2c3d4 ： 
 # loop body: vmovdqu (%rax), %xmm0 # load the sources vmovdqu (%rbx), %xmm1 vpunpcklbw %xmm0, %xmm1, %xmm2 # low halves -> 128b reg vpunpckhbw %xmm0, %xmm2, %xmm3 # high halves -> 128b reg vmovdqu %xmm2, (%rdi) # store the results vmovdqu %xmm3, 16(%rdi) # blah blah some loop structure. `punpcklbw` interleaves the bytes in the low 64 of the two source `xmm` registers. There are `..wd` (word->dword), and dword->qword versions which would be useful for 16 or 32bit elements. 
  解交织 ： a1b2c3d4 – > abcd和1234 
 #outside the loop vpcmpeqb %xmm5, %xmm5 # set to all-1s vpsrlw $8, %xmm5, %xmm5 # every 16b word has low 8b = 0xFF, high 8b = 0. # loop body vmovdqu (%rsi), %xmm2 # load two src chunks vmovdqu 16(%rsi), %xmm3 vpand %xmm2, %xmm5, %xmm0 # mask to leave only the odd bytes vpand %xmm3, %xmm5, %xmm1 vpackuswb %xmm0, %xmm1, %xmm4 vmovdqu %xmm4, (%rax) # store 16B of a[] vpsrlw $8, %xmm2, %xmm6 # even bytes -> odd bytes vpsrlw $8, %xmm3, %xmm7 vpackuswb %xmm6, %xmm7, %xmm4 vmovdqu %xmm4, (%rbx) 
 这当然可以使用更少的寄存器。 我避免重复使用寄存器的可读性，而不是性能。 硬件寄存器重命名使重用成为非问题，只要您从不依赖于先前值的内容开始。  （例如， movss ，not movss或pinsrd 。） 
 由于pack指令执行有符号或无符号饱和，因此去交错是更多的工作，因此每个16b元素的高8b必须首先归零。 
 另一种方法是使用pshufb将单个源寄存器的奇数或偶pshufb到寄存器的低64位。 但是，在AMD XOP指令集的VPPERM ，没有可以同时从2个寄存器中选择字节的混洗（就像Altivec非常喜欢的vperm ）。 因此，对于SSE / AVX，每128b交错数据需要2次重复。 而且由于存储端口的使用可能是瓶颈， punpck需要将两个64位块的a成一个寄存器来设置128b存储。 
 使用AMD XOP，deinterleave将是2x128b负载，2个VPPERM和2x128b存储。 



		      	

 过早优化是不好的 


 你的编译器可能比你更好地优化。 


 也就是说，有些事情可以帮助编译器，因为您具有编译器不能拥有的数据的语义知识： 


 读取和写入尽可能多的字节，直到本机字大小 – 内存操作很昂贵，所以尽可能在寄存器中进行操作 


 展开循环 – 查看“Duff的设备”。 


  FWIW，我制作了两个版本的复制循环，一个与你的版本大致相同，第二个使用了大多数人认为的“最佳”（尽管仍然很简单）C代码： 
 void test1(byte *p, byte *p1, byte *p2, int n) { int i, j; for (i = 0, j = 0; i < n / 2; i++, j += 2) { p1[i] = p[j]; p2[i] = p[j + 1]; } } void test2(byte *p, byte *p1, byte *p2, int n) { while (n) { *p1++ = *p++; *p2++ = *p++; n--; n--; } } 
 在Intel x86上使用gcc -O3 -S ，它们都生成了几乎相同的汇编代码。 这是内循环： 
 LBB1_2: movb -1(%rdi), %al movb %al, (%rsi) movb (%rdi), %al movb %al, (%rdx) incq %rsi addq $2, %rdi incq %rdx decq %rcx jne LBB1_2 
 和 
 LBB2_2: movb -1(%rdi), %al movb %al, (%rsi) movb (%rdi), %al movb %al, (%rdx) incq %rsi addq $2, %rdi incq %rdx addl $-2, %ecx jne LBB2_2 
 两者都具有相同数量的指令，差异仅仅因为第一个版本计数到n / 2而第二个计数减少到零。 
  编辑这里是一个更好的版本： 
 /* non-portable - assumes little endian */ void test3(byte *p, byte *p1, byte *p2, int n) { ushort *ps = (ushort *)p; n /= 2; while (n) { ushort n = *ps++; *p1++ = n; *p2++ = n >> 8; } } 
 导致： 
 LBB3_2: movzwl (%rdi), %ecx movb %cl, (%rsi) movb %ch, (%rdx) # NOREX addq $2, %rdi incq %rsi incq %rdx decq %rax jne LBB3_2 
 这是少一条指令，因为它利用了对%cl和%ch的直接访问。



  是否有可用于替代脚本的低级语言？
  具有位字段的结构中的偏移量
	访问冲突写入位置0x00000000。  从键盘读取int
AMD64  –  nopw汇编指令？
奇怪的宏定义问题
文件范围和全局范围：C＆C ++
Python ctypes：在c中调用具有自定义类型的函数
为什么运行时库是编译器选项而不是链接器选项？
我们转换Descriptor + Descriptor-> SerialNumberOffset时序列号枚举失败
CUDA矩阵乘法中断了大型矩阵
圆形链表在c窗口中无法正常工作

C中最快的去交织操作？

在Linux内核源代码中实现系统调用/陷阱

使用getchar立即检索击键

C中＆指针的结果是什么？

不满意的链接错误：java.library.path中没有库文件

如何在C中进行逐位XOR

在C / C ++中在本地时间和GMT / UTC之间转换

在main中不使用char ** argv访问命令行参数

从.p12文件中提取客户端证书和私钥

如何将hex数字打印到文件？

迭代char数组和打印字符