MPI中的进程数量是否有限制?

我正在阅读《使用MPI》并尝试自己运行书中的代码。第6.3章中有一个网格分解代码。它编译时没有任何警告或错误,使用少量进程时可以正常运行,但在我的笔记本电脑上,进程数较大(比如30)时就会失败。我的笔记本电脑是4核、超线程、8G内存。la_grid_2d_new的两个版本在进程数大时都会出问题,不过第一个版本可以容忍更大的数字(比如35能跑通),但40个进程还是失败了。我不知道为什么。请问你能帮帮我吗?非常感谢。

 #include  #include  #include  typedef struct { int P, Q; int p, q; MPI_Comm grid_comm; MPI_Comm row_comm; MPI_Comm col_comm; } LA_Grid_2d; LA_Grid_2d *la_grid_2d_new(MPI_Comm comm, int P, int Q) { LA_Grid_2d *grid; MPI_Comm row, col; int my_rank, p, q; MPI_Comm_rank(comm, &my_rank); p=my_rank/Q; q=my_rank%Q; MPI_Comm_split(comm, p, q, &row); MPI_Comm_split(comm, q, p, &col); grid=(LA_Grid_2d *)malloc(sizeof(LA_Grid_2d)); grid->grid_comm=comm; grid->row_comm=row; grid->col_comm=col; grid->P=P; grid->Q=Q; grid->p=p; grid->q=q; return grid; } LA_Grid_2d *la_grid_2d_new_II(MPI_Comm comm, int P, int Q) { LA_Grid_2d *grid; MPI_Comm comm_2d, row, col; int my_rank, p, q; int dims[2]={P,Q}, local[2], period[2]={0,0}, remain_dims[2]; MPI_Cart_create(comm, 2, dims, period, 1, &comm_2d); MPI_Comm_rank(comm, &my_rank); MPI_Cart_coords(comm_2d, my_rank, 2, local); p=local[0]; q=local[1]; remain_dims[0]=0; remain_dims[1]=1; MPI_Cart_sub(comm_2d, remain_dims, &row); remain_dims[0]=1; remain_dims[1]=0; MPI_Cart_sub(comm_2d, remain_dims, &col); grid=(LA_Grid_2d *)malloc(sizeof(LA_Grid_2d)); grid->grid_comm=comm; grid->row_comm=row; grid->col_comm=col; grid->P=P; grid->Q=Q; grid->p=p; grid->q=q; return grid; } void la_grid_2d_delete(LA_Grid_2d *grid) { free(grid); } int main(int argc, char **argv) { LA_Grid_2d *pgrid; int size, rank, dims[2]={0,0}, row, col; MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &size); MPI_Comm_rank(MPI_COMM_WORLD, &rank); if(rank==0) printf("size=%d rank=%d\n", size, rank); MPI_Dims_create(size, 2, dims); // pgrid=la_grid_2d_new(MPI_COMM_WORLD, dims[0], dims[1]); pgrid=la_grid_2d_new_II(MPI_COMM_WORLD, dims[0], dims[1]); if(rank==0) printf("dims[0]=%d dims[1]=%d\n", dims[0], dims[1]); MPI_Reduce(&rank, &row, 1, MPI_INT, MPI_SUM, 0, pgrid->row_comm); MPI_Reduce(&rank, &col, 1, MPI_INT, MPI_SUM, 0, pgrid->col_comm); la_grid_2d_delete(pgrid); MPI_Finalize(); if(rank==0) printf("row=%d col=%d\n", row, col); return 0; } 

错误消息是:

 shuang@phoebe:~/usingMPI$ mpiexec -n 20 ./grid size=20 rank=0 dims[0]=5 dims[1]=4 row=6 col=40 shuang@phoebe:~/usingMPI$ mpiexec -n 30 ./grid size=30 rank=0 dims[0]=6 dims[1]=5 [phoebe:14939] *** Process received signal *** [phoebe:14939] Signal: Floating point exception (8) [phoebe:14939] Signal code: Integer divide-by-zero (1) [phoebe:14939] Failing at address: 0x7fb1e599e6f7 [phoebe:14939] [ 0] /lib/x86_64-linux-gnu/libpthread.so.0(+0xfcb0) [0x7fb1e5714cb0] [phoebe:14939] [ 1] /usr/lib/libmpi.so.0(mca_topo_base_cart_coords+0x57) [0x7fb1e599e6f7] [phoebe:14939] [ 2] /usr/lib/libmpi.so.0(mca_topo_base_cart_sub+0x166) [0x7fb1e599ec36] [phoebe:14939] [ 3] /usr/lib/libmpi.so.0(PMPI_Cart_sub+0xba) [0x7fb1e596f34a] [phoebe:14939] [ 4] ./grid(la_grid_2d_new_II+0xd6) [0x400df6] [phoebe:14939] [ 5] ./grid(main+0x98) [0x400f07] [phoebe:14939] [ 6] /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xed) [0x7fb1e536776d] [phoebe:14939] [ 7] ./grid() [0x400b99] [phoebe:14939] *** End of error message *** -------------------------------------------------------------------------- mpiexec noticed that process rank 22 with PID 14939 on node phoebe exited on signal 8 (Floating point exception). -------------------------------------------------------------------------- 

@Sean如果您想尝试另一个OpenMPI,您通常可以下载它并使用类似的东西进行编译

 ./configure --prefix=/opt/ompi-[version] make sudo make install 

由于这将安装到非标准位置(以便以后轻松删除),您需要设置 LD_LIBRARY_PATH=/opt/ompi-[version]/lib,并用完整路径调用 mpicc 和 mpirun,以确保使用的是正确的版本。在构建过程的某个环节,它也会提醒您设置 LD_LIBRARY_PATH。