快速计算两个数组之间的相等字节数

我写了函数int compare_16bytes(__m128i lhs, __m128i rhs) ，以便使用SSE指令比较两个16字节数：此函数返回执行比较后相等的字节数。

现在我想使用上面的函数来比较任意长度的两个字节数组：长度可能不是16字节的倍数，所以我需要处理这个问题。我怎样才能完成下面这个function的实现？我怎样才能改进下面的function？

 int fast_compare(const char* s, const char* t, int length) { int result = 0; const char* sPtr = s; const char* tPtr = t; while(...) { const __m128i* lhs = (const __m128i*)sPtr; const __m128i* rhs = (const __m128i*)tPtr; // compare the next 16 bytes of s and t result += compare_16bytes(*lhs,*rhs); sPtr += 16; tPtr += 16; } return result; }

正如@Mysticial在上面的评论中所说，做垂直比较和求和，然后在主循环结束时水平求和：

 #include  #include  #include  #include  // reference implementation int fast_compare_ref(const char *s, const char *t, int length) { int result = 0; int i; for (i = 0; i < length; ++i) { if (s[i] == t[i]) result++; } return result; } // optimised implementation int fast_compare(const char *s, const char *t, int length) { int result = 0; int i; __m128i vsum = _mm_set1_epi32(0); for (i = 0; i < length - 15; i += 16) { __m128i vs, vt, v, vh, vl, vtemp; vs = _mm_loadu_si128((__m128i *)&s[i]); // load 16 chars from input vt = _mm_loadu_si128((__m128i *)&t[i]); v = _mm_cmpeq_epi8(vs, vt); // compare vh = _mm_unpackhi_epi8(v, v); // unpack compare result into 2 x 8 x 16 bit vectors vl = _mm_unpacklo_epi8(v, v); vtemp = _mm_madd_epi16(vh, vh); // accumulate 16 bit vectors into 4 x 32 bit partial sums vsum = _mm_add_epi32(vsum, vtemp); vtemp = _mm_madd_epi16(vl, vl); vsum = _mm_add_epi32(vsum, vtemp); } // get sum of 4 x 32 bit partial sums vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); result = _mm_cvtsi128_si32(vsum); // handle any residual bytes ( < 16) if (i < length) { result += fast_compare_ref(&s[i], &t[i], length - i); } return result; } // test harness int main(void) { const int n = 1000000; char *s = malloc(n); char *t = malloc(n); int i, result_ref, result; srand(time(NULL)); for (i = 0; i < n; ++i) { s[i] = rand(); t[i] = rand(); } result_ref = fast_compare_ref(s, t, n); result = fast_compare(s, t, n); printf("result_ref = %d, result = %d\n", result_ref, result);; return 0; }

编译并运行上面的测试工具：

 $ gcc -Wall -O3 -msse3 fast_compare.c -o fast_compare $ ./fast_compare result_ref = 3955, result = 3955 $ ./fast_compare result_ref = 3947, result = 3947 $ ./fast_compare result_ref = 3945, result = 3945

注意，在上面的SSE代码中有一个可能非显而易见的技巧，我们使用_mm_madd_epi16来解包并将16位0 / -1值累加到32位部分和。我们利用了-1*-1 = 1 （当然0*0 = 0 ）这一事实 - 我们在这里并没有真正进行乘法，只需在一条指令中解包和求和。

更新：如下面的评论中所述，这个解决方案并不是最优的 - 我只采用了一个相当优化的16位解决方案，并添加了8位到16位解包，使其适用于8位数据。但是对于8位数据，有更有效的方法，例如使用psadbw / _mm_sad_epu8 。我将把这个答案留给后人，对于那些可能想要用16位数据做这种事情的人，但实际上其中一个不需要解压缩输入数据的答案应该是接受的答案。

在16 x uint8元素中使用部分和可以提供更好的性能。
我把循环划分为内循环和外循环。
内循环求和uint8元素（每个uint8元素总和可达255“1”）。
小技巧：_mm_cmpeq_epi8将相等元素设置为0xFF，并且（char）0xFF = -1，因此可以从总和中减去结果（减去-1以添加1）。

这是我对fast_compare的优化版本：

 int fast_compare2(const char *s, const char *t, int length) { int result = 0; int inner_length = length; int i; int j = 0; //Points beginning of 4080 elements block. const char *s0 = s; const char *t0 = t; __m128i vsum = _mm_setzero_si128(); //Outer loop sum result of 4080 sums. for (i = 0; i < length; i += 4080) { __m128i vsum_uint8 = _mm_setzero_si128(); //16 uint8 sum elements (each uint8 element can sum up to 255). __m128i vh, vl, vhl, vhl_lo, vhl_hi; //Points beginning of 4080 elements block. s0 = s + i; t0 = t + i; if (i + 4080 <= length) { inner_length = 4080; } else { inner_length = length - i; } //Inner loop - sum up to 4080 (compared) results. //Each uint8 element can sum up to 255. 16 uint8 elements can sum up to 255*16 = 4080 (compared) results. ////////////////////////////////////////////////////////////////////////// for (j = 0; j < inner_length-15; j += 16) { __m128i vs, vt, v; vs = _mm_loadu_si128((__m128i *)&s0[j]); // load 16 chars from input vt = _mm_loadu_si128((__m128i *)&t0[j]); v = _mm_cmpeq_epi8(vs, vt); // compare - set to 0xFF where equal, and 0 otherwise. //Consider this: (char)0xFF = (-1) vsum_uint8 = _mm_sub_epi8(vsum_uint8, v); //Subtract the comparison result - subtract (-1) where equal. } ////////////////////////////////////////////////////////////////////////// vh = _mm_unpackhi_epi8(vsum_uint8, _mm_setzero_si128()); // unpack result into 2 x 8 x 16 bit vectors vl = _mm_unpacklo_epi8(vsum_uint8, _mm_setzero_si128()); vhl = _mm_add_epi16(vh, vl); //Sum high and low as uint16 elements. vhl_hi = _mm_unpackhi_epi16(vhl, _mm_setzero_si128()); //unpack sum of vh an vl into 2 x 4 x 32 bit vectors vhl_lo = _mm_unpacklo_epi16(vhl, _mm_setzero_si128()); //unpack sum of vh an vl into 2 x 4 x 32 bit vectors vsum = _mm_add_epi32(vsum, vhl_hi); vsum = _mm_add_epi32(vsum, vhl_lo); } // get sum of 4 x 32 bit partial sums vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); result = _mm_cvtsi128_si32(vsum); // handle any residual bytes ( < 16) if (j < inner_length) { result += fast_compare_ref(&s0[j], &t0[j], inner_length - j); } return result; }

大输入的最快方法是Rotem的答案，其中内部循环是pcmpeqb / psubb ，在向量累加器的任何字节元素溢出之前突破到水平和。使用psadbw对全零向量执行无符号字节的psadbw 。

如果没有展开/嵌套循环，最好的选择可能就是

 pcmpeqb -> vector of 0 or 0xFF elements psadbw -> two 64bit sums of (0*no_matches + 0xFF*matches) paddq -> accumulate the psadbw result in a vector accumulator #outside the loop: horizontal sum divide the result by 255

如果循环中没有很多寄存器压力，则psadbw的向量为0x7f而不是全零。

psadbw(0x00, set1(0x7f)) => sum += 0x7f
psadbw(0xff, set1(0x7f)) => sum += 0x80

因此，不是除以255（编译器应该在没有实际div情况下有效地执行），而是必须减去n * 0x7f ，其中n是元素的数量。

另请注意，在Nehalem和Atom之前paddq速度很慢，因此如果您不希望128 *计数溢出32位整数，则可以使用paddd （ _mm_add_epi32 ）。

这与Paul R的pcmpeqb / 2x punpck / 2x pmaddwd / 2x paddw 。

SSE中的整数比较产生全部为零或全部为1的字节。如果要计数，首先需要右移（不算术）比较结果7，然后添加到结果向量。最后，您仍然需要通过对其元素求和来减少结果向量。这种减少必须在标量代码中完成，或者通过一系列添加/移位来完成。通常这部分不值得麻烦。

快速计算两个数组之间的相等字节数

Objective-C检查是否定义了Structs

SQLite3并限制结果数量

如何在C中的for（;;）循环中初始化几个变量？

在这种情况下我真的需要互斥锁吗？

将此代码行转换为C.

设计一个支持海量数据存储和查询的系统

做最后减少的策略

最低位的索引

Python C扩展：从引擎中提取参数

将共享库编译成程序？