获取64位整数内的位位数组

好吧，听起来有点复杂，但这正是我想要做的：

以10101010101为例
并返回{ 0, 2, 4, 6, 8, 10 } – 一个数组，其中包含所有位的位置

这是我的代码：

 UINT DQBitboard::firstBit(U64 bitboard) { static const int index64[64] = { 63, 0, 58, 1, 59, 47, 53, 2, 60, 39, 48, 27, 54, 33, 42, 3, 61, 51, 37, 40, 49, 18, 28, 20, 55, 30, 34, 11, 43, 14, 22, 4, 62, 57, 46, 52, 38, 26, 32, 41, 50, 36, 17, 19, 29, 10, 13, 21, 56, 45, 25, 31, 35, 16, 9, 12, 44, 24, 15, 8, 23, 7, 6, 5 }; static const U64 debruijn64 = 0x07EDD5E59A4E28C2ULL; #pragma warning (disable: 4146) return index64[((bitboard & -bitboard) * debruijn64) >> 58]; } vector DQBitboard::bits(U64 bitboard) { vector res; while (bitboard) { UINT first = DQBitboard::firstBit(bitboard); res.push_back(first); bitboard &= ~(1ULL<<first); } return res; }

代码肯定会起作用 。

我的观点是：

你有没有更快的实施？
你注意到任何可以优化的东西吗？如果是这样，什么？

提示：

UINT是unsigned int的typedef
U64是unsigned long long的typedef
两种方法都是static inline 。

这是另一个可以分析的建议（可以与其他建议结合使用以进一步优化）。注意，这里的循环是O(number of set bits) 。

 vector bits_set (UINT64 data) { UINT n; vector res; res.reserve(64); for (n = 0; data != 0; n++, data &= (data - 1)) { res.push_back(log2(data & ~(data-1))); } return res; }

比特转换非常便宜。查找表占用缓存空间，查找也有整数乘法。我希望，蛮力将比聪明的技术更快。

 vector DQBitboard::bits(U64 bitboard) { vector res; res.reserve(64); uint_fast8_t pos = 1; do { if (bitboard & 1) res.push_back(pos); ++pos; } while (bitboard >>= 1); return res; }

您可以稍微展开循环，这可能会使它更快。

到目前为止， std::vector是最昂贵的部分。考虑直接使用位板。例如：

 struct bitboard_end_iterator{}; struct bitboard_iterator { U64 value; uint_fast8_t pos; bitboard_iterator(U64 bitboard) : value(bitboard), pos(0) { ++(*this); } UINT operator*() const { return pos + 1; } bool operator==( bitboard_end_iterator ) const { return pos == 64; } operator bool() const { return pos < 64; } bitboard_iterator& operator++() { while (U64 prev = value) { value >>= 1; ++pos; if (prev & 1) return *this; } pos = 64; return *this; } };

现在你可以写了

 for( bitboard_iterator it(bitboard); it; ++it ) cout << *it;

而且我想你会得到你的位列表。

版本2：

 class bitboard_fast_iterator { U64 value; uint_fast8_t pos; public: bitboard_fast_iterator(U64 bitboard = 0) : value(bitboard), pos(__builtin_ctzll(value)) {} UINT operator*() const { return pos + 1; } operator bool() const { return value != 0; } bitboard_iterator& operator++() { value &= ~(1ULL << pos); pos = __builtin_ctzll(value); return *this; } };

我一直想知道它是否能更快地使用bst汇编指令。所以我尝试了3次实现，并获得了1000万次迭代的以下结果：

你的实施（Dr.Kameleon）1.77秒

log2（）实现（icepack）2.17秒

我的assembly实施（我）0.16秒

输出：

 bits version: Function started at 0 ended at 177 spent 177 (1.770000 seconds) c version: Function started at 177 ended at 394 spent 217 (2.170000 seconds) c version: Function started at 394 ended at 410 spent 16 (0.160000 seconds)

关于C / C ++的一点，静态是可怕的。它实际上是在CPU指令列表中编译的（不是我所期望的!!!）而是在无名空间中使用函数外部的数组。这将产生预期的效果。虽然在汇编中你可以使用.long（或其他一些大小）然后％rip来引用来自IP的数据。

注意：一旦编译，我没有看到我的汇编版本中使用的大小（n），所以我不太确定返回的数组是否有效。除此之外，代码本身变成了5个汇编指令的循环，因此速度微小增加（大约x10）。

log2（）慢的原因是它将数字转换为xmm寄存器，然后调用另一个函数。然后它将xmm寄存器转换回常规寄存器。

 #include  #include  #include  #include  #include  #include  #include  #include  using namespace std; namespace { const int index64[64] = { 63, 0, 58, 1, 59, 47, 53, 2, 60, 39, 48, 27, 54, 33, 42, 3, 61, 51, 37, 40, 49, 18, 28, 20, 55, 30, 34, 11, 43, 14, 22, 4, 62, 57, 46, 52, 38, 26, 32, 41, 50, 36, 17, 19, 29, 10, 13, 21, 56, 45, 25, 31, 35, 16, 9, 12, 44, 24, 15, 8, 23, 7, 6, 5 }; const uint64_t debruijn64 = 0x07EDD5E59A4E28C2ULL; } int firstBit(uint64_t bitboard) { return index64[((bitboard & -bitboard) * debruijn64) >> 58]; } vector bits(uint64_t bitboard) { vector res; res.reserve(64); while(bitboard) { int first = firstBit(bitboard); res.push_back(first); bitboard &= ~(1ULL << first); } return res; } vector bits_c(uint64_t bitboard) { int n; vector res; res.reserve(64); for (n = 0; bitboard != 0; n++, bitboard &= (bitboard - 1)) { res.push_back(log2(bitboard & ~(bitboard - 1))); } return res; } vector bits_asm(uint64_t bitboard) { int64_t n(0); int res[64]; asm( "bsf %[b], %%rax\n\t" "je exit\n\t" ".align 16\n" "loop:\n\t" "mov %%eax, (%[r],%[n],4)\n\t" "btr %%rax, %[b]\n\t" "inc %[n]\n\t" "bsf %[b], %%rax\n\t" "je loop\n" "exit:\n\t" : /* output */ "=r" (n) : /* input */ [n] "r" (n), [r] "r" (res), [b] "r" (bitboard) : /* state */ "eax", "cc" ); return vector(res, res + n); } class run_timer { public: run_timer() { } void start() { times(&f_start); } void stop() { times(&f_stop); } void report(const char *msg) { printf("%s:\n" "Function started at %ld\n" " ended at %ld\n" " spent %ld (%f seconds)\n", msg, f_start.tms_utime, f_stop.tms_utime, f_stop.tms_utime - f_start.tms_utime, (double)(f_stop.tms_utime - f_start.tms_utime)/(double)sysconf(_SC_CLK_TCK)); } struct tms f_start; struct tms f_stop; }; int main(int argc, char *argv[]) { run_timer t; t.start(); for(int i(0); i < 10000000; ++i) { bits(rand()); } t.stop(); t.report("bits version"); t.start(); for(int i(0); i < 10000000; ++i) { bits_c(rand()); } t.stop(); t.report("c version"); t.start(); for(int i(0); i < 10000000; ++i) { bits_asm(rand()); } t.stop(); t.report("c version"); return 0; }

使用此命令行使用g ++编译：

 c++ -msse4.2 -O2 -o bits -c bits.cpp

虽然您可能认为-msse4.2可能是log2（）版本的问题，但我尝试了没有它，然后log2（）慢了。

顺便说一句，我不推荐这种方法，因为它不便携。只有基于Intel的计算机才能理解这些指令。

使用BSF或BSR指令将您的firstBit函数替换为内部函数，以实现大规模加速。

在gcc中，它是__builtin_ffsll和__builtin_ctzll

使用Visual C +， _BitScanForward和_BitScanReverse

我现在能想到的最快就是使用预先生成的 ~~map~~ 所有数字的数组（它不必是所有数字，你可以例如在8位或16位部分中打破数字，然后将返回的数组与一些适当的加法连接起来以说明位的实际位置）。

我尝试了一个天真的版本，其速度提高了大约2-3倍，但是先保留了向量（）。在将保留应用于原始算法时，它击败了天真的算法。

所以我怀疑向量运算在这里是更大的成本，而不是位操作（甚至是下一位函数中使用的乘法）。

关于找到最低设置位，还有一些其他的加速。我的本地log2版本很差，原帖中给出的function并不便宜。

这是我最好的尝试：

 void bits(uint64 bitboard, vector &res) { res.resize(64); int i = 0; while (bitboard) { int first; _BitScanForward64((DWORD *)&first, bitboard); res[i++] = first; bitboard &= ~(1ULL<

用asm内在替换firstBit函数。使用内在函数在这里给了很大的推动。（这显然不是可移植代码，但我怀疑GCC变体不应该太棘手）。

还假设向量是合理持久的，而不是一直动态分配/复制，只是适当地resize。

 const size_t size = sizeof(U64)*8; U64 val = 1; vector res; res.reserve(size); for ( size_t i = size; i > 0; --i ) { if ( ( val & bitboard ) != 0 ) { res.push_back(i); } val <<= 1; }

我实际上认为最简单，最简单的方法只是循环，但如果我们传入一个向量而不是以后制作副本，它应该更快一点。

 void DQBitboard::bits(U64 bitboard, vector &res) { res.clear(); // Make sure vector is empty. Not necessary if caller does this! int bit = 0; while (bitboard) { if (bitboard & 1) res.push_back(bit); bit++; bitboard >>= 1; } return res; }

在findfirst中的乘法将花费一点，所以我怀疑它是否真的值得。

获取64位整数内的位位数组

用64位int移位

uint64_t变量中C的按位移位运算

跟踪位操作的优化机会？

C左移64位失败

Endian表示64位值