MPI IO读写块循环矩阵

我有一个学校项目在hpc分布式系统上进行矩阵乘法。

我需要从并行IO系统读取矩阵并使用pblacs在许多计算节点（处理器）上并行执行矩阵乘法。必须使用MPI IO命令读取数据。我知道PBlacs使用块循环分布来执行乘法。

教授没有向我们提供有关MPI IO的更多信息，而且我很难在其上找到更多的信息/资源。 具体来说，是否有方法以块循环方式从并行io系统中读取矩阵并轻松将其插入到pblacs pdgemm中？

任何指向有用资源的指针都会非常感激。我的时间有点短暂，对这个项目缺乏方向感到沮丧。

这实际上相对简单（如果你已经了解了一些关于blacs / scalapack和mpi-io的东西！）但即便如此，文档 – 甚至是在线 – 就像你发现的那样，有点差。

首先要了解的是MPI-IO，它允许您使用普通的MPI数据类型来指定文件的每个进程“视图”，然后只读取属于该视图的数据。在我们的中心，我们有关于并行IO的半天课程的幻灯片和源代码; 大约三分之一是关于MPI-IO的基础知识。这里有幻灯片，这里有示例源代码。

第二件事是MPI有一种内置的方法来创建“分布式arrays”数据类型，其中一个组合允许您布局块循环数据分布; 我在回答这个问题时的一般性讨论： mpi中的darray和子数组有什么区别？。

这意味着如果你有一个包含大矩阵的二进制文件，你可以使用MPI_Type_create_darray用mpi-io读取它，它将以块循环方式由任务分发。那么这只是做blacs或scalapack调用的问题。在我对计算科学堆栈交换的问题的答案中列出了使用scalapack psgemv进行矩阵向量乘法而不是psgemm的示例程序。

为了让您了解这些部分如何组合在一起，以下是一个简单的程序，它读入包含矩阵的二进制文件（首先是方形矩阵N的大小，然后是N ^ 2个元素），然后计算特征值和使用scalapack（新） pssyevr例程的向量。它结合了MPI-IO，darray和scalapack的东西。它在Fortran中，但函数调用在基于C语言中是相同的。

 ! ! Use MPI-IO to read a diagonal matrix distributed block-cyclically, ! use Scalapack to calculate its eigenvalues, and compare ! to expected results. ! program darray use mpi implicit none integer :: n, nb ! problem size and block size integer :: myArows, myAcols ! size of local subset of global array real :: p real, dimension(:), allocatable :: myA, myZ real, dimension(:), allocatable :: work integer, dimension(:), allocatable :: iwork real, dimension(:), allocatable :: eigenvals real, dimension(:), allocatable :: expected integer :: worksize, totwork, iworksize integer, external :: numroc ! blacs routine integer :: me, procs, icontxt, prow, pcol, myrow, mycol ! blacs data integer :: info ! scalapack return value integer, dimension(9) :: ides_a, ides_z ! scalapack array desc integer :: clock real :: calctime, iotime character(len=128) :: filename integer :: mpirank integer :: ierr integer, dimension(2) :: pdims, dims, distribs, dargs integer :: infile integer, dimension(MPI_STATUS_SIZE) :: mpistatus integer :: darray integer :: locsize, nelements integer(kind=MPI_ADDRESS_KIND) :: lb, locextent integer(kind=MPI_OFFSET_KIND) :: disp integer :: nargs integer :: m, nz ! Initialize MPI (for MPI-IO) call MPI_Init(ierr) call MPI_Comm_size(MPI_COMM_WORLD,procs,ierr) call MPI_Comm_rank(MPI_COMM_WORLD,mpirank,ierr) ! May as well get the process grid from MPI_Dims_create pdims = 0 call MPI_Dims_create(procs, 2, pdims, ierr) prow = pdims(1) pcol = pdims(2) ! get filename nargs = command_argument_count() if (nargs /= 1) then print *,'Usage: darray filename' print *,' Where filename = name of binary matrix file.' call MPI_Abort(MPI_COMM_WORLD,1,ierr) endif call get_command_argument(1, filename) ! find the size of the array we'll be using call tick(clock) call MPI_File_open(MPI_COMM_WORLD, filename, MPI_MODE_RDONLY, MPI_INFO_NULL, infile, ierr) call MPI_File_read_all(infile,n,1,MPI_INTEGER,mpistatus,ierr) call MPI_File_close(infile,ierr) ! create the darray that will read in the data. nb = 64 if (nb > (N/prow)) nb = N/prow if (nb > (N/pcol)) nb = N/pcol dims = [n,n] distribs = [MPI_DISTRIBUTE_CYCLIC, MPI_DISTRIBUTE_CYCLIC] dargs = [nb, nb] call MPI_Type_create_darray(procs, mpirank, 2, dims, distribs, dargs, & pdims, MPI_ORDER_FORTRAN, MPI_REAL, darray, ierr) call MPI_Type_commit(darray,ierr) call MPI_Type_size(darray, locsize, ierr) nelements = locsize/4 call MPI_Type_get_extent(darray, lb, locextent, ierr) ! Initialize local arrays allocate(myA(nelements)) allocate(myZ(nelements)) allocate(eigenvals(n), expected(n)) ! read in the data call MPI_File_open(MPI_COMM_WORLD, trim(filename), MPI_MODE_RDONLY, MPI_INFO_NULL, infile, ierr) disp = 4 ! skip N = 4 bytes call MPI_File_set_view(infile, disp, MPI_REAL, darray, "native", MPI_INFO_NULL, ierr) call MPI_File_read_all(infile, myA, nelements, MPI_REAL, mpistatus, ierr) call MPI_File_close(infile,ierr) iotime = tock(clock) if (mpirank == 0) then print *,'I/O time = ', iotime endif ! Initialize blacs processor grid call tick(clock) call blacs_pinfo (me,procs) call blacs_get (-1, 0, icontxt) call blacs_gridinit(icontxt, 'R', prow, pcol) call blacs_gridinfo(icontxt, prow, pcol, myrow, mycol) myArows = numroc(n, nb, myrow, 0, prow) myAcols = numroc(n, nb, mycol, 0, pcol) ! Construct local arrays ! Global structure: matrix A of n rows and n columns ! Prepare array descriptors for ScaLAPACK call descinit( ides_a, n, n, nb, nb, 0, 0, icontxt, myArows, info ) call descinit( ides_z, n, n, nb, nb, 0, 0, icontxt, myArows, info ) ! Call ScaLAPACK library routine allocate(work(1), iwork(1)) iwork(1) = -1 work(1) = -1. call pssyevr( 'V', 'A', 'U', n, myA, 1, 1, ides_a, -1.e20, +1.e20, 1, n, & m, nz, eigenvals, myZ, 1, 1, ides_z, work, -1, & iwork, -1, info ) worksize = int(work(1))/2*3 iworksize = iwork(1)/2*3 print *, 'Local workspace ', worksize call MPI_Reduce(worksize, totwork, 1, MPI_INTEGER, MPI_SUM, 0, MPI_COMM_WORLD, ierr) if (mpirank == 0) print *, ' total work space ', totwork call MPI_Reduce(iworksize, totwork, 1, MPI_INTEGER, MPI_SUM, 0, MPI_COMM_WORLD, ierr) if (mpirank == 0) print *, ' total iwork space ', totwork deallocate(work,iwork) allocate(work(worksize),iwork(iworksize)) call pssyev('N','U',n,myA,1,1,ides_a,eigenvals,myZ,1,1,ides_z,work,worksize,info) if (info /= 0) then print *, 'Error: info = ', info else if (mpirank == 0) then print *, 'Calculated ', m, ' eigenvalues and ', nz, ' eigenvectors.' endif ! Deallocate the local arrays deallocate(myA, myZ) deallocate(work, iwork) ! End blacs for processors that are used call blacs_gridexit(icontxt) calctime = tock(clock) ! calculated the expected eigenvalues for a particular test matrix p = 3. + sqrt((4. * n - 3.) * (n - 1.)*3./(n+1.)) expected(1) = p/(n*(5.-2.*n)) expected(2) = 6./(p*(n + 1.)) expected(3:n) = 1. ! Print results if (me == 0) then if (info /= 0) then print *, 'Error -- info = ', info endif print *,'Eigenvalues L_infty err = ', & maxval(abs(eigenvals-expected)) print *,'Compute time = ', calctime endif deallocate(eigenvals, expected) call MPI_Finalize(ierr) contains subroutine tick(t) integer, intent(OUT) :: t call system_clock(t) end subroutine tick ! returns time in seconds from now to time described by t real function tock(t) integer, intent(in) :: t integer :: now, clock_rate call system_clock(now,clock_rate) tock = real(now - t)/real(clock_rate) end function tock end program darray

MPI IO读写块循环矩阵

为什么要在C和C ++中避免使用system（）函数？

使用指定的初始值设定项初始化数组时的奇怪值

修改const int时gcc -O2标志会产生影响

如何在GCC x86中使用RDTSC计算时钟周期？

mingw32-g ++。exe：错误：CreateProcess：再没有这样的文件或目录

为什么pthread_create（）会失败，只有2个线程处于活动状态？

C有哈希/字典数据结构吗？

C中的fwrite在输出文件中给出不同的值

c中的strptime与时区偏移

如何在二进制代码编译后查看C代码？