什么是找到平均大小分布而不必声明一个巨大的2D数组的有效方法?
考虑L×L大小矩阵M,其条目可以是0或1.每个元素是1,概率为p,0是概率1-p。 我将标记为1的元素称为黑色元素,将标记为0的元素称为白色元素。 我正在尝试编写一个代码:
-
生成随机矩阵,条目为0和1。 我需要输入矩阵L的大小和概率p。
-
标记属于同一群集的所有黑色元素具有相同的编号。 ( 将一个黑色元素簇定义为单元格图形中的最大连通分量,其值为1,其中边连接的行和列之间的差异最多为1(因此每个单元最多有8个邻居)。如果矩阵的两个黑色元素共享边或顶点,则认为它们属于同一个黑色簇。即,将矩阵视为大方块,将元素视为大方块中的小方块。 )
- 在从p = 0到p = 100的循环内(我将概率p称为百分比):将每个大小的黑色簇的数量写入对应于该p值的文件。
例如:
输入 :p = 30,L = 50
输出 (写入每个p的数据文件;因此该程序创建了101个数据文件,从p = 0到p = 100):
1 100(即有100个大小为1的黑色簇)
2 20(即有20个黑色簇,大小为2)
3 15(即有15个黑色簇,大小为3)
等等…
#include "stdafx.h" #include #include #include #include #include #include int *labels; int n_labels = 0; int uf_find(int x) { int y = x; while (labels[y] != y) y = labels[y]; while (labels[x] != x) { int z = labels[x]; labels[x] = y; x = z; } return y; } int uf_union(int x, int y) { return labels[uf_find(x)] = uf_find(y); } int uf_make_set(void) { labels[0] ++; assert(labels[0] b?a:b) #define min(a,b) (a>b?b:a) int hoshen_kopelman(int **matrix, int m, int n) { uf_initialize(m * n / 2); for (int i = 0; i<m; i++) for (int j = 0; j<n; j++) if (matrix[i][j]) { int up = (i == 0 ? 0 : matrix[i - 1][j]); int left = (j == 0 ? 0 : matrix[i][j - 1]); switch (!!up + !!left) { case 0: matrix[i][j] = uf_make_set(); break; case 1: matrix[i][j] = max(up, left); break; case 2: matrix[i][j] = uf_union(up, left); break; } int north_west; if (i == 0 || j == 0) north_west = 0; else north_west = matrix[i - 1][j - 1]; int north_east; if (i == 0 || j == (n - 1)) north_east = 0; else north_east = matrix[i - 1][j + 1]; if (!!north_west == 1) { if (i != 0 && j != 0) { //if (rand() % 2 == 1) //{ if (matrix[i][j - 1] == 0 && matrix[i - 1][j] == 0) { if (!!matrix[i][j] == 0) matrix[i][j] = north_west; else uf_union(north_west, matrix[i][j]); } //} } else if (i == 0 || j == 0) { } } if (!!north_east == 1) { if (i != 0 && j != n - 1) { //if (rand() % 2 == 1) //{ if (matrix[i - 1][j] == 0 && matrix[i][j + 1] == 0) { if (!!matrix[i][j] == 0) matrix[i][j] = north_east; else uf_union(north_east, matrix[i][j]); } //} } else if (i == 0 || j == n - 1) { } } } int *new_labels = (int*)calloc(sizeof(int), n_labels); for (int i = 0; i<m; i++) for (int j = 0; j<n; j++) if (matrix[i][j]) { int x = uf_find(matrix[i][j]); if (new_labels[x] == 0) { new_labels[0]++; new_labels[x] = new_labels[0]; } matrix[i][j] = new_labels[x]; } int total_clusters = new_labels[0]; free(new_labels); uf_done(); return total_clusters; } void check_labelling(int **matrix, int m, int n) { int N, S, E, W; for (int i = 0; i<m; i++) for (int j = 0; j<n; j++) if (matrix[i][j]) { N = (i == 0 ? 0 : matrix[i - 1][j]); S = (i == m - 1 ? 0 : matrix[i + 1][j]); E = (j == n - 1 ? 0 : matrix[i][j + 1]); W = (j == 0 ? 0 : matrix[i][j - 1]); assert(N == 0 || matrix[i][j] == N); assert(S == 0 || matrix[i][j] == S); assert(E == 0 || matrix[i][j] == E); assert(W == 0 || matrix[i][j] == W); } } int cluster_count(int **matrix, int size, int N) { int i; int j; int count = 0; for (i = 0; i < size; i++) { for (j = 0; j < size; j++) { if (matrix[i][j] == N) count++; } } return count; } int main(int argc, char **argv) { srand((unsigned int)time(0)); int p = 0; printf("Enter number of rows/columns: "); int size = 0; scanf("%d", &size); printf("\n"); FILE *fp; printf("Enter number of averaging iterations: "); int iterations = 0; scanf("%d", &iterations); for (int p = 0; p <= 100; p++) { char str[100]; sprintf(str, "BlackSizeDistribution%03i.txt", p); fp = fopen(str, "w"); int **matrix; matrix = (int**)calloc(10, sizeof(int*)); int** matrix_new = (int**)calloc(10, sizeof(int*)); matrix_new = (int **)realloc(matrix, sizeof(int*) * size); matrix = matrix_new; for (int i = 0; i < size; i++) { matrix[i] = (int *)calloc(size, sizeof(int)); for (int j = 0; j < size; j++) { int z = rand() % 100; z = z + 1; if (p == 0) matrix[i][j] = 0; if (z <= p) matrix[i][j] = 1; else matrix[i][j] = 0; } } hoshen_kopelman(matrix, size, size); int highest = matrix[0][0]; for (int i = 0; i < size; i++) for (int j = 0; j < size; j++) if (highest < matrix[i][j]) highest = matrix[i][j]; int* counter = (int*)calloc(sizeof(int*), highest + 1); int high = 0; for (int k = 1; k high) high = counter[k]; } int* size_distribution = (int*)calloc(sizeof(int*), high + 1); for (int y = 1; y <= high; y++) { int count = 0; for (int z = 1; z <= highest; z++) if (counter[z] == y) count++; size_distribution[y] = count; } for (int k = 1; k <= high; k++) { fprintf(fp, "\n%d\t%d", k, size_distribution[k]); printf("\n%d\t%d", k, size_distribution[k]); } printf("\n"); check_labelling(matrix, size, size); for (int i = 0; i < size; i++) free(matrix[i]); free(matrix); } }
我使用该程序输出的数据文件,使用Gnuplot为每个p的范围从0到100创建条形图。
一些示例图:
这一个对应于输入p = 3和L = 100
这一个对应于p = 90和L = 100
好吧,所以我想这对于我所遇到的实际问题来说已经足够了。
我写的程序只输出每p值1次迭代的值。 由于该程序用于科学计算目的,我需要更准确的值,因此我需要输出“平均”大小分布值; 超过50或100次迭代。 我不确定如何进行平均。
更清楚这是我想要的:
假设程序的三个不同运行的输出给了我(比如说p = 3,L = 100)
Size Iteration 1 Iteration 2 Iteration 3 Averaged Value (Over 3 iterations) 1 150 170 180 167 2 10 20 15 18 3 1 2 1 1 4 0 0 1 0 . . .
我想做类似的事情,除了我需要执行50或100次迭代的平均以获得我的数学模型的准确值。 顺便说一句,我真的不需要输出每次迭代的值,即迭代1,迭代2,迭代3,…在数据文件中。 我需要“打印”的是上表中的第一列和最后一列(具有平均值)。
现在,我可能需要修改其主函数来进行平均。
这是主要function:
int main(int argc, char **argv) { srand((unsigned int)time(0)); int p = 0; printf("Enter number of rows/columns: "); int size = 0; scanf("%d", &size); printf("\n"); FILE *fp; printf("Enter number of averaging iterations: "); int iterations = 0; scanf("%d", &iterations); for (int p = 0; p <= 100; p++) { char str[100]; sprintf(str, "BlackSizeDistribution%03i.txt", p); fp = fopen(str, "w"); int **matrix; matrix = (int**)calloc(10, sizeof(int*)); int** matrix_new = (int**)calloc(10, sizeof(int*)); matrix_new = (int **)realloc(matrix, sizeof(int*) * size); matrix = matrix_new; for (int i = 0; i < size; i++) { matrix[i] = (int *)calloc(size, sizeof(int)); for (int j = 0; j < size; j++) { int z = rand() % 100; z = z + 1; if (p == 0) matrix[i][j] = 0; if (z <= p) matrix[i][j] = 1; else matrix[i][j] = 0; } } hoshen_kopelman(matrix, size, size); int highest = matrix[0][0]; for (int i = 0; i < size; i++) for (int j = 0; j < size; j++) if (highest < matrix[i][j]) highest = matrix[i][j]; int* counter = (int*)calloc(sizeof(int*), highest + 1); int high = 0; for (int k = 1; k high) high = counter[k]; } int* size_distribution = (int*)calloc(sizeof(int*), high + 1); for (int y = 1; y <= high; y++) { int count = 0; for (int z = 1; z <= highest; z++) if (counter[z] == y) count++; size_distribution[y] = count; } for (int k = 1; k <= high; k++) { fprintf(fp, "\n%d\t%d", k, size_distribution[k]); printf("\n%d\t%d", k, size_distribution[k]); } printf("\n"); check_labelling(matrix, size, size); for (int i = 0; i < size; i++) free(matrix[i]); free(matrix); } }
我想到的方法之一是声明一个2D数组的大小(largest size of black cluster possible for that) x (number of averaging iterations I want)
,p循环的每次运行,以及程序结束时从2Darrays中提取每个p的平均大小分布。 在尺寸为100 x 100(比如)的矩阵中,最大尺寸的黑色簇可能是10,000。 但是,对于较小的p值(比如p = 1,p = 20,……),这样的大尺寸星团甚至都不会产生! 因此,在开始时创建如此大的2Darrays将是一个可怕的内存空间浪费,执行需要数天 ! (请记住,我需要运行此程序L = 1000和L = 5000,如果可能,甚至L = 10000为我的目的)
必须有一些其他更有效的方法来做到这一点。 但我不知道是什么。 任何帮助表示赞赏。
啊,一个非常接近我自己心的话题! 🙂
因为您正在收集统计数据,并且确切的配置仅在收集所述统计数据的持续时间内有趣,您只需要两个L×L矩阵:一个用于保存颜色信息(一个白色,0到L×L黑色,所以一个类型可以保存0到L 2 +1之间的整数,包括端点); 另一个用于保存该类型的元素数(0到L 2之间的整数,包括0和L 2 )。
为了保持每个L和p值的每个大小的黑色簇的数量,在多次迭代中,您将需要第三个L×L + 1个元素的arrays; 但请注意,如果使用相同的L和p进行N次迭代,其值可以达到N×L×L。
标准的C伪随机数发生器很可怕; 不要使用它。 使用Mersenne Twister或我最喜欢的Xorshift64 * (或相关的Xorshift1024 *)。 虽然Xorshift64 *速度极快,但它的分布对于像你这样的模拟而言是足够“随机”的。
而不是只存储“黑色”或“白色”,而是存储“不感兴趣”或群集ID; 最初,所有(未连接的)黑人都有唯一的群集ID。 这样,您可以实现不相交的集合 ,并将连接的单元格非常有效地连接到集群。
以下是我在伪代码中实现的方法:
Let id[L*L] be the color information; index is L*row+col Let idcount[L*L+1] be the id count over one iteration Let count[L*L+1] be the number of clusters across N iterations Let limit = (p / 100.0) * PRNG_MAX Let white = L*L (and blacks are 0 to L*L-1, inclusive) Clear count[] to all zeros For iter = 1 to N, inclusive: For row = 0 to L-1, inclusive: If PRNG() <= limit: Let id[L*row] = L*row Else: Let id[L*row] = white End If For col = 1 to L-1, inclusive: If PRNG() <= limit: If id[L*row+col-1] == white: Let id[L*row+col] = L*row + col Else: Let id[L*row+col] = id[L*row+col-1] End If Else: Let id[L*row+col] = white End If End For End For
请注意,我在生成单元ID时执行水平连接传递,只需对连续的黑色单元重用相同的ID即可。 此时,如果PRNG()
返回0和PRNG_MAX
之间的无符号整数(包括0和PRNG_MAX
PRNG()
,则id[][]
现在以p
%的概率填充各种ID的黑色单元格。
接下来,您进行连接传递。 由于水平连接的单元格已经连接,因此您需要执行垂直和两个对角线。 请注意,加入时应使用路径展平,以提高效率。
For row = 0 to L-2, inclusive: For col = 0 to L-1, inclusive: If (id[L*row+col] != white) And (id[L*row+L+col] != white): Join id[L*row+col] and id[L*row+L+col] End If End For End For For row = 0 to L-2, inclusive: For col = 0 to L-2, inclusive: If (id[L*row+col] != white) And (id[L*row+L+col+1] != white): Join id[L*row+col] and id[L*row+L+col+1] End If End For End For For row = 1 to L-1, inclusive: For col = 0 to L-2, inclusive: If (id[L*row+col] != white) And (id[L*row-L+col+1] != white): Join id[L*row+col] and id[L*row-L+col+1] End If End For End For
当然,您应该组合循环,以保持最佳的访问位置。 (分别执行col == 0
,并在单独的内循环中使用row == 0
,最小化if
子句,因为这些子句往往很慢。)你甚至可以使id
数组(L + 1) 2的大小,填充外部具有白色的细胞边缘,以简化上述连接成单个双环,以4L + 4个额外细胞的成本使用。
此时,您需要展平每个不相交的集合。 非white
每个ID值都是L*row+col
(表示“this”),或者是对另一个单元格的引用。 展平意味着对于每个ID,我们在链中找到最终的ID,并使用它代替:
For i = 0 to L*L-1, inclusive: If (id[i] != white): Let k = i While (id[k] != k): k = id[k] End While id[i] = k End If End For
接下来,我们需要计算具有特定ID的单元格数:
Clear idn[] For i = 0 to L*L-1, inclusive: Increment idn[id[i]] End For
因为我们对N次迭代中特定大小的簇的数量感兴趣,所以我们可以简单地通过在此迭代中找到的每个特定大小的簇更新count
数组:
For i = 1 to L*L, inclusive: Increment count[idn[i]] End For Let count[0] = 0 End For
此时count[i]
包含在N
次迭代中找到的i
单元的黑色簇的数量; 换句话说,它是在N次迭代期间看到的簇大小的直方图(离散分布)。
上述的一种实现可以如下。
(第一个实现有数组大小和集群标签的问题;这个版本将该部分拆分成一个单独的文件,并且更容易validation是否正确。但是,这只是第二个版本,所以我不认为它是生产 - 质量。)
首先,操作矩阵的函数cluster.h :
#ifndef CLUSTER_H #define CLUSTER_H #include #include #include #include #include /* This is in the public domain. Written by Nominal Animal */ typedef uint32_t cluster_label; typedef uint32_t cluster_size; static size_t rows = 0; /* Number of mutable rows */ static size_t cols = 0; /* Number of mutable columns */ static size_t nrows = 0; /* Number of accessible rows == rows + 1 */ static size_t ncols = 0; /* Number of accessible columns == cols + 1 */ static size_t cells = 0; /* Total number of cells == nrows*ncols */ static size_t empty = 0; /* Highest-numbered label == nrows*ncols - 1 */ static size_t sizes = 0; /* Number of possible cluster sizes == rows*cols + 1 */ #define INDEX(row, col) (ncols*(row) + (col)) cluster_label *label = NULL; /* 2D contents: label[ncols*(row) + (col)] */ cluster_size *occurs = NULL; /* Number of occurrences of each label value */ /* Xorshift64* PRNG used for generating the matrix. */ static uint64_t prng_state = 1; static inline uint64_t prng_u64(void) { uint64_t state = prng_state; state ^= state >> 12; state ^= state << 25; state ^= state >> 27; prng_state = state; return state * UINT64_C(2685821657736338717); } static inline uint64_t prng_u64_less(void) { uint64_t result; do { result = prng_u64(); } while (result == UINT64_C(18446744073709551615)); return result; } static inline uint64_t prng_seed(const uint64_t seed) { if (seed) prng_state = seed; else prng_state = 1; return prng_state; } static inline uint64_t prng_randomize(void) { int discard = 1024; uint64_t seed; FILE *in; /* By default, use time. */ prng_seed( ((uint64_t)time(NULL) * 2832631) ^ ((uint64_t)clock() * 1120939) ); #ifdef __linux__ /* On Linux, read from /dev/urandom. */ in = fopen("/dev/urandom", "r"); if (in) { if (fread(&seed, sizeof seed, 1, in) == 1) prng_seed(seed); fclose(in); } #endif /* Churn the state a bit. */ seed = prng_state; while (discard-->0) { seed ^= seed >> 12; seed ^= seed << 25; seed ^= seed >> 27; } return prng_state = seed; } static inline void cluster_free(void) { free(occurs); occurs = NULL; free(label); label = NULL; rows = 0; cols = 0; nrows = 0; ncols = 0; cells = 0; empty = 0; sizes = 0; } static int cluster_init(const size_t want_cols, const size_t want_rows) { cluster_free(); if (want_cols < 1 || want_rows < 1) return -1; /* Invalid number of rows or columns */ rows = want_rows; cols = want_cols; nrows = rows + 1; ncols = cols + 1; cells = nrows * ncols; if ((size_t)(cells / nrows) != ncols || nrows <= rows || ncols <= cols) return -1; /* Size overflows */ empty = nrows * ncols - 1; sizes = rows * cols + 1; label = calloc(cells, sizeof label[0]); occurs = calloc(cells, sizeof occurs[0]); if (!label || !occurs) { cluster_free(); return -1; /* Not enough memory. */ } return 0; } static void join2(size_t i1, size_t i2) { size_t root1 = i1, root2 = i2, root; while (root1 != label[root1]) root1 = label[root1]; while (root2 != label[root2]) root2 = label[root2]; root = root1; if (root > root2) root = root2; while (label[i1] != root) { const size_t temp = label[i1]; label[i1] = root; i1 = temp; } while (label[i2] != root) { const size_t temp = label[i2]; label[i2] = root; i2 = temp; } } static void join3(size_t i1, size_t i2, size_t i3) { size_t root1 = i1, root2 = i2, root3 = i3, root; while (root1 != label[root1]) root1 = label[root1]; while (root2 != label[root2]) root2 = label[root2]; while (root3 != label[root3]) root3 = label[root3]; root = root1; if (root > root2) root = root2; if (root > root3) root = root3; while (label[i1] != root) { const size_t temp = label[i1]; label[i1] = root; i1 = temp; } while (label[i2] != root) { const size_t temp = label[i2]; label[i2] = root; i2 = temp; } while (label[i3] != root) { const size_t temp = label[i3]; label[i3] = root; i3 = temp; } } static void join4(size_t i1, size_t i2, size_t i3, size_t i4) { size_t root1 = i1, root2 = i2, root3 = i3, root4 = i4, root; while (root1 != label[root1]) root1 = label[root1]; while (root2 != label[root2]) root2 = label[root2]; while (root3 != label[root3]) root3 = label[root3]; while (root4 != label[root4]) root4 = label[root4]; root = root1; if (root > root2) root = root2; if (root > root3) root = root3; if (root > root4) root = root4; while (label[i1] != root) { const size_t temp = label[i1]; label[i1] = root; i1 = temp; } while (label[i2] != root) { const size_t temp = label[i2]; label[i2] = root; i2 = temp; } while (label[i3] != root) { const size_t temp = label[i3]; label[i3] = root; i3 = temp; } while (label[i4] != root) { const size_t temp = label[i4]; label[i4] = root; i4 = temp; } } static void cluster_fill(const uint64_t plimit) { size_t r, c; /* Fill the matrix with the specified probability. For efficiency, we use the same label for all horizontally consecutive cells. */ for (r = 0; r < rows; r++) { const size_t imax = ncols*r + cols; cluster_label last; size_t i = ncols*r; last = (prng_u64_less() < plimit) ? i : empty; label[i] = last; while (++i < imax) { if (prng_u64_less() < plimit) { if (last == empty) last = i; } else last = empty; label[i] = last; } label[imax] = empty; } /* Fill the extra row with empty, too. */ { cluster_label *ptr = label + ncols*rows; cluster_label *const end = label + ncols*nrows; while (ptr < end) *(ptr++) = empty; } /* On the very first row, we need to join non-empty cells vertically and diagonally down. */ for (c = 0; c < cols; c++) switch ( ((label[c] < empty) << 0) | ((label[c+ncols] < empty) << 1) | ((label[c+ncols+1] < empty) << 2) ) { /* <1> * * 2 4 */ case 3: /* Join down */ join2(c, c+ncols); break; case 5: /* Join down right */ join2(c, c+ncols+1); break; case 7: /* Join down and down right */ join3(c, c+ncols, c+ncols+1); break; } /* On the other rows, we need to join non-empty cells vertically, diagonally down, and diagonally up. */ for (r = 1; r < rows; r++) { const size_t imax = ncols*r + cols; size_t i; for (i = ncols*r; i < imax; i++) switch ( ((label[i] < empty) << 0) | ((label[i-ncols+1] < empty) << 1) | ((label[i+ncols] < empty) << 2) | ((label[i+ncols+1] < empty) << 3) ) { /* 2 * * <1> * * 4 8 */ case 3: /* Diagonally up */ join2(i, i-ncols+1); break; case 5: /* Down */ join2(i, i+ncols); break; case 7: /* Down and diagonally up */ join3(i, i-ncols+1, i+ncols); break; case 9: /* Diagonally down */ join2(i, i+ncols+1); break; case 11: /* Diagonally up and diagonally down */ join3(i, i-ncols+1, i+ncols+1); break; case 13: /* Down and diagonally down */ join3(i, i+ncols, i+ncols+1); break; case 15: /* Down, diagonally down, and diagonally up */ join4(i, i-ncols+1, i+ncols, i+ncols+1); break; } } /* Flatten the labels, so that all cells belonging to the same cluster will have the same label. */ { const size_t imax = ncols*rows; size_t i; for (i = 0; i < imax; i++) if (label[i] < empty) { size_t k = i, kroot = i; while (kroot != label[kroot]) kroot = label[kroot]; while (label[k] != kroot) { const size_t temp = label[k]; label[k] = kroot; k = temp; } } } /* Count the number of occurrences of each label. */ memset(occurs, 0, (empty + 1) * sizeof occurs[0]); { cluster_label *const end = label + ncols*rows; cluster_label *ptr = label; while (ptr < end) ++occurs[*(ptr++)]; } } #endif /* CLUSTER_H */
请注意,要获得完整的概率范围, prng_u64_less()
函数永远不会返回尽可能高的uint64_t
。 然而,这种做法有点过分,精确。
main.c解析输入参数,迭代并打印结果:
#include #include #include #include "cluster.h" /* This program is in the public domain. Written by Nominal Animal */ int main(int argc, char *argv[]) { uint64_t *count = NULL; uint64_t plimit; unsigned long size, iterations, i; double probability; char dummy; if (argc != 4) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: %s SIZE ITERATIONS PROBABILITY\n", argv[0]); fprintf(stderr, "\n"); fprintf(stderr, "Where SIZE sets the matrix size (SIZE x SIZE),\n"); fprintf(stderr, " ITERATIONS is the number of iterations, and\n"); fprintf(stderr, " PROBABILITY is the probability [0, 1] of a cell\n"); fprintf(stderr, " being non-empty.\n"); fprintf(stderr, "\n"); return EXIT_FAILURE; } if (sscanf(argv[1], " %lu %c", &size, &dummy) != 1 || size < 1u) { fprintf(stderr, "%s: Invalid matrix size.\n", argv[1]); return EXIT_FAILURE; } if (sscanf(argv[2], " %lu %c", &iterations, &dummy) != 1 || iterations < 1u) { fprintf(stderr, "%s: Invalid number of iterations.\n", argv[2]); return EXIT_FAILURE; } if (sscanf(argv[3], " %lf %c", &probability, &dummy) != 1 || probability < 0.0 || probability > 1.0) { fprintf(stderr, "%s: Invalid probability.\n", argv[3]); return EXIT_FAILURE; } /* Technically, we want plimit = (uint64_t)(0.5 + 18446744073709551615.0*p), but doubles do not have the precision for that. */ if (probability > 0.9999999999999999) plimit = UINT64_C(18446744073709551615); else if (probability <= 0) plimit = UINT64_C(0); else plimit = (uint64_t)(18446744073709551616.0 * probability); /* Try allocating memory for the matrix and the label count array. */ if (size > 2147483646u || cluster_init(size, size)) { fprintf(stderr, "%s: Matrix size is too large.\n", argv[1]); return EXIT_FAILURE; } /* Try allocating memory for the cluster size histogram. */ count = calloc(sizes + 1, sizeof count[0]); if (!count) { fprintf(stderr, "%s: Matrix size is too large.\n", argv[1]); return EXIT_FAILURE; } printf("# Xorshift64* seed is %" PRIu64 "\n", prng_randomize()); printf("# Matrix size is %lu x %lu\n", size, size); printf("# Probability of a cell to be non-empty is %.6f%%\n", 100.0 * ((double)plimit / 18446744073709551615.0)); printf("# Collecting statistics over %lu iterations\n", iterations); printf("# Size Count CountPerIteration\n"); fflush(stdout); for (i = 0u; i < iterations; i++) { size_t k = empty; cluster_fill(plimit); /* Add cluster sizes to the histogram. */ while (k-->0) ++count[occurs[k]]; } /* Print the histogram of cluster sizes. */ { size_t k = 1; printf("%zu %" PRIu64 " %.3f\n", k, count[k], (double)count[k] / (double)iterations); for (k = 2; k < sizes; k++) if (count[k-1] != 0 || count[k] != 0 || count[k+1] != 0) printf("%zu %" PRIu64 " %.3f\n", k, count[k], (double)count[k] / (double)iterations); } return EXIT_SUCCESS; }
程序以Gnuplot兼容格式输出簇大小分布(直方图),从以#
开头的几条注释行开始,以指示用于计算结果的确切参数。
输出中的第一列是簇大小(簇中的单元格数),第二列是迭代期间找到的此类簇的精确计数,第三列是观察到的发生概率(即,总计数)出现次数除以迭代次数)。
以下是L = 1000,N = 1000的簇大小分布与p的几个不同值相似: 请注意,两个轴都具有对数缩放,因此它是对数 - 对数图。 它是通过运行上述程序几次生成的,每个唯一的p
运行一次,并将输出保存到文件(包含p
值的名称),然后将它们打印到Gnuplot中的单个图中。
为了validation聚类function,我编写了一个测试程序,它生成一个矩阵,为每个聚类分配随机颜色,并将其输出为PPM图像(可以使用NetPBM工具或基本上任何图像处理程序进行操作); ppm.c :
#include #include #include #include "cluster.h" /* This program is in the public domain. Written by Nominal Animal */ int main(int argc, char *argv[]) { uint64_t plimit; uint32_t *color; unsigned long size, r, c; double probability; char dummy; if (argc != 3) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: %s SIZE PROBABILITY > matrix.ppm\n", argv[0]); fprintf(stderr, "\n"); fprintf(stderr, "Where SIZE sets the matrix size (SIZE x SIZE),\n"); fprintf(stderr, " PROBABILITY is the probability [0, 1] of a cell\n"); fprintf(stderr, " being non-empty.\n"); fprintf(stderr, "\n"); return EXIT_FAILURE; } if (sscanf(argv[1], " %lu %c", &size, &dummy) != 1 || size < 1u) { fprintf(stderr, "%s: Invalid matrix size.\n", argv[1]); return EXIT_FAILURE; } if (sscanf(argv[2], " %lf %c", &probability, &dummy) != 1 || probability < 0.0 || probability > 1.0) { fprintf(stderr, "%s: Invalid probability.\n", argv[2]); return EXIT_FAILURE; } /* Technically, we want plimit = (uint64_t)(0.5 + 18446744073709551615.0*p), but doubles do not have the precision for that. */ if (probability > 0.9999999999999999) plimit = UINT64_C(18446744073709551615); else if (probability <= 0) plimit = UINT64_C(0); else plimit = (uint64_t)(18446744073709551616.0 * probability); /* Try allocating memory for the matrix and the label count array. */ if (size > 2147483646u || cluster_init(size, size)) { fprintf(stderr, "%s: Matrix size is too large.\n", argv[1]); return EXIT_FAILURE; } /* Try allocating memory for the color lookup array. */ color = calloc(empty + 1, sizeof color[0]); if (!color) { fprintf(stderr, "%s: Matrix size is too large.\n", argv[1]); return EXIT_FAILURE; } fprintf(stderr, "Using Xorshift64* seed %" PRIu64 "\n", prng_randomize()); fflush(stderr); /* Construct the matrix. */ cluster_fill(plimit); { size_t i; color[empty] = 0xFFFFFFu; /* Assign random colors. */ for (i = 0; i < empty; i++) color[i] = prng_u64() >> 40; } printf("P6\n%lu %lu 255\n", size, size); for (r = 0; r < rows; r++) for (c = 0; c < cols; c++) { const uint32_t rgb = color[label[ncols*r + c]]; fputc((rgb >> 16) & 255, stdout); fputc((rgb >> 8) & 255, stdout); fputc( rgb & 255, stdout); } fflush(stdout); fprintf(stderr, "Done.\n"); return EXIT_SUCCESS; }
这是p = 40%的256x256矩阵看起来像:
请注意,白色背景不计为群集。
这是我的另一个答案的补充,显示了一个分别计算白色(0)和黑色(1)聚类的示例程序。 这太长了,不适合单一的答案。
我们的想法是,我们使用单独的地图来表示每个单元格的颜色。 此贴图还包含边框单元格,其颜色与实际矩阵中的任何颜色都不匹配。 不相交集是一个单独的数组,每个矩阵单元有一个条目。
为了跟踪这两种颜色,我们在计算每个不相交集合根的出现次数时,为每种颜色使用单独的数组。 (或者换句话说,矩阵中每个簇的大小。)
收集统计信息时,群集大小将更新为每种颜色的单独直方图。 即使内存消耗很大,这种方案似乎也非常有效。 ( R
行和C
列的矩阵需要大约25*R*C + R + 2*C
(加上常量)字节,包括直方图。)
这是头文件clusters.h ,它实现了所有的计算逻辑:
#ifndef CLUSTERS_H #define CLUSTERS_H #include #include #include #include /* This file is in public domain. No guarantees, no warranties. Written by Nominal Animal . */ /* For pure C89 compilers, use '-DSTATIC_INLINE=static' at compile time. */ #ifndef STATIC_INLINE #define STATIC_INLINE static inline #endif #define ERR_INVALID -1 /* Invalid function parameter */ #define ERR_TOOLARGE -2 /* Matrix size is too large */ #define ERR_NOMEM -3 /* Out of memory */ typedef unsigned char cluster_color; typedef uint32_t cluster_label; typedef uint64_t cluster_count; #define CLUSTER_WHITE 0 #define CLUSTER_BLACK 1 #define CLUSTER_NONE UCHAR_MAX /* Reserved */ #define FMT_COLOR "u" #define FMT_LABEL PRIu32 #define FMT_COUNT PRIu64 typedef struct { uint64_t state; } prng; typedef struct { /* Pseudo-random number generator used */ prng rng; /* Actual size of the matrix */ cluster_label rows; cluster_label cols; /* Number of matrices the histograms have been collected from */ cluster_count iterations; /* Probability of each cell being black */ uint64_t p_black; /* Probability of diagonal connections */ uint64_t d_black; uint64_t d_white; /* Cluster colormap contains (rows+2) rows and (cols+1) columns */ cluster_color *map; /* Disjoint set of (rows) rows and (cols) columns */ cluster_label *djs; /* Number of occurrences per disjoint set root */ cluster_label *white_roots; cluster_label *black_roots; /* Histograms of white and black clusters */ cluster_count *white_histogram; cluster_count *black_histogram; } cluster; #define CLUSTER_INITIALIZER { {0}, 0, 0, 0, 0.0, 0.0, 0.0, NULL, NULL, NULL, NULL, NULL, NULL } /* Calculate uint64_t limit corresponding to probability p. */ STATIC_INLINE uint64_t probability_limit(const double p) { if (p <= 0.0) return UINT64_C(0); else if (p <= 0.5) return (uint64_t)(p * 18446744073709551615.0); else if (p >= 1.0) return UINT64_C(18446744073709551615); else return UINT64_C(18446744073709551615) - (uint64_t)((double)(1.0 - p) * 18446744073709551615.0); } /* Return true at probability corresponding to limit 'limit'. This implements a Xorshift64* pseudo-random number generator. */ STATIC_INLINE int probability(prng *const rng, const uint64_t limit) { uint64_t state = rng->state; uint64_t value; /* To correctly cover the entire range, we ensure we never generate a zero. */ do { state ^= state >> 12; state ^= state << 25; state ^= state >> 27; value = state * UINT64_C(2685821657736338717); } while (!value); rng->state = state; return (value <= limit) ? CLUSTER_BLACK : CLUSTER_WHITE; } /* Generate a random seed for the Xorshift64* pseudo-random number generator. */ static uint64_t randomize(prng *const rng) { unsigned int rounds = 127; uint64_t state = UINT64_C(3069887672279) * (uint64_t)time(NULL) ^ UINT64_C(60498839) * (uint64_t)clock(); if (!state) state = 1; /* Churn the state a bit. */ while (rounds-->0) { state ^= state >> 12; state ^= state << 25; state ^= state >> 27; } if (rng) rng->state = state; return state; } /* Free all resources related to a cluster. */ STATIC_INLINE void free_cluster(cluster *c) { if (c) { /* Free dynamically allocated pointers. Note: free(NULL) is safe. */ free(c->map); free(c->djs); free(c->white_roots); free(c->black_roots); free(c->white_histogram); free(c->black_histogram); c->rng.state = 0; c->rows = 0; c->cols = 0; c->iterations = 0; c->p_black = 0; c->d_white = 0; c->d_black = 0; c->map = NULL; c->djs = NULL; c->white_roots = 0; c->black_roots = 0; c->white_histogram = NULL; c->black_histogram = NULL; } } /* Initialize cluster structure, for a matrix of specified size. */ static int init_cluster(cluster *c, const int rows, const int cols, const double p_black, const double d_white, const double d_black) { const cluster_label label_cols = cols; const cluster_label label_rows = rows; const cluster_label color_rows = rows + 2; const cluster_label color_cols = cols + 1; const cluster_label color_cells = color_rows * color_cols; const cluster_label label_cells = label_rows * label_cols; const cluster_label labels = label_cells + 2; /* One extra! */ if (!c) return ERR_INVALID; c->rng.state = 0; /* Invalid seed for Xorshift64*. */ c->rows = 0; c->cols = 0; c->iterations = 0; c->p_black = 0; c->d_white = 0; c->d_black = 0; c->map = NULL; c->djs = NULL; c->white_roots = NULL; c->black_roots = NULL; c->white_histogram = NULL; c->black_histogram = NULL; if (rows < 1 || cols < 1) return ERR_INVALID; if ((unsigned int)color_rows <= (unsigned int)rows || (unsigned int)color_cols <= (unsigned int)cols || (cluster_label)(color_cells / color_rows) != color_cols || (cluster_label)(color_cells / color_cols) != color_rows || (cluster_label)(label_cells / label_rows) != label_cols || (cluster_label)(label_cells / label_cols) != label_rows) return ERR_TOOLARGE; c->black_histogram = calloc(labels, sizeof (cluster_count)); c->white_histogram = calloc(labels, sizeof (cluster_count)); c->black_roots = calloc(labels, sizeof (cluster_label)); c->white_roots = calloc(labels, sizeof (cluster_label)); c->djs = calloc(label_cells, sizeof (cluster_label)); c->map = calloc(color_cells, sizeof (cluster_color)); if (!c->map || !c->djs || !c->white_roots || !c->black_roots || !c->white_histogram || !c->black_histogram) { free(c->map); free(c->djs); free(c->white_roots); free(c->black_roots); free(c->white_histogram); free(c->black_histogram); return ERR_NOMEM; } c->rows = rows; c->cols = cols; c->p_black = probability_limit(p_black); c->d_white = probability_limit(d_white); c->d_black = probability_limit(d_black); /* Initialize the color map to NONE. */ { cluster_color *ptr = c->map; cluster_color *const end = c->map + color_cells; while (ptr < end) *(ptr++) = CLUSTER_NONE; } /* calloc() initialized the other arrays to zeros already. */ return 0; } /* Disjoint set: find root. */ STATIC_INLINE cluster_label djs_root(const cluster_label *const djs, cluster_label from) { while (from != djs[from]) from = djs[from]; return from; } /* Disjoint set: path compression. */ STATIC_INLINE void djs_path(cluster_label *const djs, cluster_label from, const cluster_label root) { while (from != root) { const cluster_label temp = djs[from]; djs[from] = root; from = temp; } } /* Disjoint set: Flatten. Returns the root, and flattens the path to it. */ STATIC_INLINE cluster_label djs_flatten(cluster_label *const djs, cluster_label from) { const cluster_label root = djs_root(djs, from); djs_path(djs, from, root); return root; } /* Disjoint set: Join two subsets. */ STATIC_INLINE void djs_join2(cluster_label *const djs, cluster_label from1, cluster_label from2) { cluster_label root, temp; root = djs_root(djs, from1); temp = djs_root(djs, from2); if (root > temp) temp = root; djs_path(djs, from1, root); djs_path(djs, from2, root); } /* Disjoint set: Join three subsets. */ STATIC_INLINE void djs_join3(cluster_label *const djs, cluster_label from1, cluster_label from2, cluster_label from3) { cluster_label root, temp; root = djs_root(djs, from1); temp = djs_root(djs, from2); if (root > temp) root = temp; temp = djs_root(djs, from3); if (root > temp) root = temp; djs_path(djs, from1, root); djs_path(djs, from2, root); djs_path(djs, from3, root); } /* Disjoint set: Join four subsets. */ STATIC_INLINE void djs_join4(cluster_label *const djs, cluster_label from1, cluster_label from2, cluster_label from3, cluster_label from4) { cluster_label root, temp; root = djs_root(djs, from1); temp = djs_root(djs, from2); if (root > temp) root = temp; temp = djs_root(djs, from3); if (root > temp) root = temp; temp = djs_root(djs, from4); if (root > temp) root = temp; djs_path(djs, from1, root); djs_path(djs, from2, root); djs_path(djs, from3, root); djs_path(djs, from4, root); } /* Disjoint set: Join five subsets. */ STATIC_INLINE void djs_join5(cluster_label *const djs, cluster_label from1, cluster_label from2, cluster_label from3, cluster_label from4, cluster_label from5) { cluster_label root, temp; root = djs_root(djs, from1); temp = djs_root(djs, from2); if (root > temp) root = temp; temp = djs_root(djs, from3); if (root > temp) root = temp; temp = djs_root(djs, from4); if (root > temp) root = temp; temp = djs_root(djs, from5); if (root > temp) root = temp; djs_path(djs, from1, root); djs_path(djs, from2, root); djs_path(djs, from3, root); djs_path(djs, from4, root); djs_path(djs, from5, root); } static void iterate(cluster *const cl) { prng *const rng = &(cl->rng); uint64_t const p_black = cl->p_black; uint64_t d_color[2]; cluster_color *const map = cl->map + cl->cols + 2; cluster_label const map_stride = cl->cols + 1; cluster_label *const djs = cl->djs; cluster_label *roots[2]; cluster_label const rows = cl->rows; cluster_label const cols = cl->cols; int r, c; d_color[CLUSTER_WHITE] = cl->d_white; d_color[CLUSTER_BLACK] = cl->d_black; roots[CLUSTER_WHITE] = cl->white_roots; roots[CLUSTER_BLACK] = cl->black_roots; for (r = 0; r < rows; r++) { cluster_label const curr_i = r * cols; cluster_color *const curr_row = map + r * map_stride; cluster_color *const prev_row = curr_row - map_stride; for (c = 0; c < cols; c++) { cluster_color color = probability(rng, p_black); cluster_label label = curr_i + c; uint64_t diag = d_color[color]; unsigned int joins = 0; /* Assign the label and color of the current cell, */ djs[label] = label; curr_row[c] = color; /* Because we join left, up-left, up, and up-right, and all those have been assigned to already, we can do the necessary joins right now. */ /* Join left? */ joins |= (curr_row[c-1] == color) << 0; /* Join up? */ joins |= (prev_row[c ] == color) << 1; /* Join up left? */ joins |= (prev_row[c-1] == color && probability(rng, diag)) << 2; /* Join up right? */ joins |= (prev_row[c+1] == color && probability(rng, diag)) << 3; /* Do the corresponding joins. */ switch (joins) { case 1: /* Left */ djs_join2(djs, label, label - 1); break; case 2: /* Up */ djs_join2(djs, label, label - cols); break; case 3: /* Left and up */ djs_join3(djs, label, label - 1, label - cols); break; case 4: /* Up-left */ djs_join2(djs, label, label - cols - 1); break; case 5: /* Left and up-left */ djs_join3(djs, label, label - 1, label - cols - 1); break; case 6: /* Up and up-left */ djs_join3(djs, label, label - cols, label - cols - 1); break; case 7: /* Left, up, and up-left */ djs_join4(djs, label, label - 1, label - cols, label - cols - 1); break; case 8: /* Up-right */ djs_join2(djs, label, label - cols + 1); break; case 9: /* Left and up-right */ djs_join3(djs, label, label - 1, label - cols + 1); break; case 10: /* Up and up-right */ djs_join3(djs, label, label - cols, label - cols + 1); break; case 11: /* Left, up, and up-right */ djs_join4(djs, label, label - 1, label - cols, label - cols + 1); break; case 12: /* Up-left and up-right */ djs_join3(djs, label, label - cols - 1, label - cols + 1); break; case 13: /* Left, up-left, and up-right */ djs_join4(djs, label, label - 1, label - cols - 1, label - cols + 1); break; case 14: /* Up, up-left, and up-right */ djs_join4(djs, label, label - cols, label - cols - 1, label - cols + 1); break; case 15: /* Left, up, up-left, and up-right */ djs_join5(djs, label, label - 1, label - cols, label - cols - 1, label - cols + 1); break; } } } /* Count the occurrences of each disjoint-set root label. */ if (roots[0] && roots[1]) { const size_t labels = rows * cols + 2; /* Clear the counts. */ memset(roots[0], 0, labels * sizeof (cluster_label)); memset(roots[1], 0, labels * sizeof (cluster_label)); for (r = 0; r < rows; r++) { const cluster_color *const curr_row = map + r * map_stride; const cluster_label curr_i = r * cols; for (c = 0; c < cols; c++) roots[curr_row[c]][djs_flatten(djs, curr_i + c)]++; } } else { size_t i = rows * cols; while (i-->0) djs_flatten(djs, i); } /* Collect the statistics. */ if (cl->white_histogram && roots[0]) { const cluster_label *const root_count = roots[0]; cluster_count *const histogram = cl->white_histogram; size_t i = rows * cols + 1; while (i-->0) histogram[root_count[i]]++; } if (cl->black_histogram && roots[1]) { const cluster_label *const root_count = roots[1]; cluster_count *const histogram = cl->black_histogram; size_t i = rows * cols + 1; while (i-->0) histogram[root_count[i]]++; } /* Note: index zero and (rows*cols+1) are zero in the histogram, for ease of scanning. */ if (cl->white_histogram || cl->black_histogram) { const size_t n = rows * cols + 1; if (cl->white_histogram) { cl->white_histogram[0] = 0; cl->white_histogram[n] = 0; } if (cl->black_histogram) { cl->black_histogram[0] = 0; cl->black_histogram[n] = 0; } } /* One more iteration performed. */ cl->iterations++; } #endif /* CLUSTERS_H */
Here is an example program, distribution.c , that collects cluster size statistics over a number of iterations, and outputs them in format suitable for plotting with for example Gnuplot. Run it without arguments to see usage and help.
#include #include #include #include #include "clusters.h" /* This file is in public domain. No guarantees, no warranties. Written by Nominal Animal . */ #define DEFAULT_ROWS 100 #define DEFAULT_COLS 100 #define DEFAULT_P_BLACK 0.0 #define DEFAULT_D_WHITE 0.0 #define DEFAULT_D_BLACK 0.0 #define DEFAULT_ITERS 1 int usage(const char *argv0) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: %s [ -h | --help ]\n", argv0); fprintf(stderr, " %s OPTIONS [ > output.txt ]\n", argv0); fprintf(stderr, "\n"); fprintf(stderr, "Options:\n"); fprintf(stderr, " rows=SIZE Set number of rows. Default is %d.\n", DEFAULT_ROWS); fprintf(stderr, " cols=SIZE Set number of columns. Default is %d.\n", DEFAULT_ROWS); fprintf(stderr, " L=SIZE Set rows=SIZE and cols=SIZE.\n"); fprintf(stderr, " black=P Set the probability of a cell to be black. Default is %g.\n", DEFAULT_P_BLACK); fprintf(stderr, " All non-black cells are white.\n"); fprintf(stderr, " dwhite=P Set the probability of white cells connecting diagonally.\n"); fprintf(stderr, " Default is %g.\n", DEFAULT_D_WHITE); fprintf(stderr, " dblack=P Set the probability of black cells connecting diagonally.\n"); fprintf(stderr, " Default is %g.\n", DEFAULT_D_BLACK); fprintf(stderr, " N=COUNT Number of iterations for gathering statistics. Default is %d.\n", DEFAULT_ITERS); fprintf(stderr, " seed=U64 Set the Xorshift64* pseudorandom number generator seed; nonzero.\n"); fprintf(stderr, " Default is to pick one randomly (based on time).\n"); fprintf(stderr, "\n"); fprintf(stderr, "The output consists of comment lines and data lines.\n"); fprintf(stderr, "Comment lines begin with a #:\n"); fprintf(stderr, " # This is a comment line.\n"); fprintf(stderr, "Each data line contains a cluster size, the number of white clusters of that size\n"); fprintf(stderr, "observed during iterations, the number of black clusters of that size observed\n"); fprintf(stderr, "during iterations, and the number of any clusters of that size observed:\n"); fprintf(stderr, " SIZE WHITE_CLUSTERS BLACK_CLUSTERS TOTAL_CLUSTERS\n"); fprintf(stderr, "\n"); return EXIT_SUCCESS; } int main(int argc, char *argv[]) { int rows = DEFAULT_ROWS; int cols = DEFAULT_COLS; double p_black = DEFAULT_P_BLACK; double d_white = DEFAULT_D_WHITE; double d_black = DEFAULT_D_BLACK; long iters = DEFAULT_ITERS; uint64_t seed = 0; cluster c = CLUSTER_INITIALIZER; int arg, itemp; uint64_t u64temp; double dtemp; long ltemp; char dummy; size_t n; size_t i; if (argc < 2) return usage(argv[0]); for (arg = 1; arg < argc; arg++) if (!strcmp(argv[arg], "-h") || !strcmp(argv[arg], "/?") || !strcmp(argv[arg], "--help")) return usage(argv[0]); else if (sscanf(argv[arg], "L=%d %c", &itemp, &dummy) == 1 || sscanf(argv[arg], "l=%d %c", &itemp, &dummy) == 1 || sscanf(argv[arg], "size=%d %c", &itemp, &dummy) == 1) { rows = itemp; cols = itemp; } else if (sscanf(argv[arg], "seed=%" SCNu64 " %c", &u64temp, &dummy) == 1 || sscanf(argv[arg], "seed=%" SCNx64 " %c", &u64temp, &dummy) == 1 || sscanf(argv[arg], "s=%" SCNu64 " %c", &u64temp, &dummy) == 1 || sscanf(argv[arg], "s=%" SCNx64 " %c", &u64temp, &dummy) == 1) { seed = u64temp; } else if (sscanf(argv[arg], "N=%ld %c", <emp, &dummy) == 1 || sscanf(argv[arg], "n=%ld %c", <emp, &dummy) == 1 || sscanf(argv[arg], "count=%ld %c", <emp, &dummy) == 1) { iters = ltemp; } else if (sscanf(argv[arg], "rows=%d %c", &itemp, &dummy) == 1 || sscanf(argv[arg], "r=%d %c", &itemp, &dummy) == 1 || sscanf(argv[arg], "height=%d %c", &itemp, &dummy) == 1 || sscanf(argv[arg], "h=%d %c", &itemp, &dummy) == 1) { rows = itemp; } else if (sscanf(argv[arg], "columns=%d %c", &itemp, &dummy) == 1 || sscanf(argv[arg], "cols=%d %c", &itemp, &dummy) == 1 || sscanf(argv[arg], "c=%d %c", &itemp, &dummy) == 1 || sscanf(argv[arg], "width=%d %c", &itemp, &dummy) == 1 || sscanf(argv[arg], "w=%d %c", &itemp, &dummy) == 1) { cols = itemp; } else if (sscanf(argv[arg], "black=%lf %c", &dtemp, &dummy) == 1 || sscanf(argv[arg], "p0=%lf %c", &dtemp, &dummy) == 1 || sscanf(argv[arg], "b=%lf %c", &dtemp, &dummy) == 1 || sscanf(argv[arg], "P=%lf %c", &dtemp, &dummy) == 1 || sscanf(argv[arg], "p0=%lf %c", &dtemp, &dummy) == 1 || sscanf(argv[arg], "p=%lf %c", &dtemp, &dummy) == 1) { p_black = dtemp; } else if (sscanf(argv[arg], "white=%lf %c", &dtemp, &dummy) == 1 || sscanf(argv[arg], "p1=%lf %c", &dtemp, &dummy) == 1) { p_black = 1.0 - dtemp; } else if (sscanf(argv[arg], "dwhite=%lf %c", &dtemp, &dummy) == 1 || sscanf(argv[arg], "dw=%lf %c", &dtemp, &dummy) == 1 || sscanf(argv[arg], "d0=%lf %c", &dtemp, &dummy) == 1) { d_white = dtemp; } else if (sscanf(argv[arg], "dblack=%lf %c", &dtemp, &dummy) == 1 || sscanf(argv[arg], "db=%lf %c", &dtemp, &dummy) == 1 || sscanf(argv[arg], "d1=%lf %c", &dtemp, &dummy) == 1) { d_black = dtemp; } else { fprintf(stderr, "%s: Unknown option.\n", argv[arg]); return EXIT_FAILURE; } switch (init_cluster(&c, rows, cols, p_black, d_white, d_black)) { case 0: break; /* OK */ case ERR_INVALID: fprintf(stderr, "Invalid size.\n"); return EXIT_FAILURE; case ERR_TOOLARGE: fprintf(stderr, "Size is too large.\n"); return EXIT_FAILURE; case ERR_NOMEM: fprintf(stderr, "Not enough memory.\n"); return EXIT_FAILURE; } if (!seed) seed = randomize(NULL); c.rng.state = seed; /* The largest possible cluster has n cells. */ n = (size_t)rows * (size_t)cols; /* Print the comments describing the initial parameters. */ printf("# seed: %" PRIu64 " (Xorshift 64*)\n", seed); printf("# size: %d rows, %d columns\n", rows, cols); printf("# P(black): %.6f (%" PRIu64 "/18446744073709551615)\n", p_black, c.p_black); printf("# P(black connected diagonally): %.6f (%" PRIu64 "/18446744073709551615)\n", d_black, c.d_black); printf("# P(white connected diagonally): %.6f (%" PRIu64 "/18446744073709551615)\n", d_white, c.d_white); fflush(stdout); while (iters-->0) iterate(&c); printf("# Iterations: %" PRIu64 "\n", c.iterations); printf("#\n"); printf("# size white_clusters(size) black_clusters(size) clusters(size)\n"); /* Note: c._histogram[0] == c._histogram[n] == 0, for ease of scanning. */ for (i = 1; i <= n; i++) if (c.white_histogram[i-1] || c.white_histogram[i] || c.white_histogram[i+1] || c.black_histogram[i-1] || c.black_histogram[i] || c.black_histogram[i+1]) printf("%lu %" FMT_COUNT " %" FMT_COUNT " %" FMT_COUNT "\n", (unsigned long)i, c.white_histogram[i], c.black_histogram[i], c.white_histogram[i]+c.black_histogram[i]); /* Since we are exiting anyway, this is not really necessary. */ free_cluster(&c); /* All done. */ return EXIT_SUCCESS; }
Here is a plot generated by running distribution.c with L=100 black=0.5 dwhite=0 dblack=0.25 seed=20120622 N=100000
(cluster size distributions collected from hundred thousand 100x100 matrices, where the probability of a nonzero cell is 50%, and there is a 25% probability of nonzero cells connecting diagonally, zero cells never connecting diagonally, only horizontally and vertically): As you can see, because nonzero clusters can sometimes connect diagonally too, there are more larger nonzero clusters than zero clusters. The computation took about 37 seconds on an i5-7200U laptop. Because the Xorshift64* seed is given explicitly, this is a repeatable test.