MPI IO读写块循环矩阵

我有一个学校项目在hpc分布式系统上进行矩阵乘法。

我需要从并行IO系统读取矩阵并使用pblacs在许多计算节点(处理器)上并行执行矩阵乘法。 必须使用MPI IO命令读取数据。 我知道PBlacs使用块循环分布来执行乘法。

教授没有向我们提供有关MPI IO的更多信息,而且我很难在其上找到更多的信息/资源。 具体来说,是否有方法以块循环方式从并行io系统中读取矩阵并轻松将其插入到pblacs pdgemm中?

任何指向有用资源的指针都会非常感激。 我的时间有点短暂,对这个项目缺乏方向感到沮丧。

这实际上相对简单(如果你已经了解了一些关于blacs / scalapack和mpi-io的东西!)但即便如此,文档 – 甚至是在线 – 就像你发现的那样,有点差。

首先要了解的是MPI-IO,它允许您使用普通的MPI数据类型来指定文件的每个进程“视图”,然后只读取属于该视图的数据。 在我们的中心,我们有关于并行IO的半天课程的幻灯片和源代码; 大约三分之一是关于MPI-IO的基础知识。 这里有幻灯片, 这里有示例源代码。

第二件事是MPI有一种内置的方法来创建“分布式arrays”数据类型,其中一个组合允许您布局块循环数据分布; 我在回答这个问题时的一般性讨论: mpi中的darray和子数组有什么区别? 。

这意味着如果你有一个包含大矩阵的二进制文件,你可以使用MPI_Type_create_darray用mpi-io读取它,它将以块循环方式由任务分发。 那么这只是做blacs或scalapack调用的问题。 在我对计算科学堆栈交换的问题的答案中列出了使用scalapack psgemv进行矩阵向量乘法而不是psgemm的示例程序。

为了让您了解这些部分如何组合在一起,以下是一个简单的程序,它读入包含矩阵的二进制文件(首先是方形矩阵N的大小,然后是N ^ 2个元素),然后计算特征值和使用scalapack(新) pssyevr例程的向量。 它结合了MPI-IO,darray和scalapack的东西。 它在Fortran中,但函数调用在基于C语言中是相同的。

 ! ! Use MPI-IO to read a diagonal matrix distributed block-cyclically, ! use Scalapack to calculate its eigenvalues, and compare ! to expected results. ! program darray use mpi implicit none integer :: n, nb ! problem size and block size integer :: myArows, myAcols ! size of local subset of global array real :: p real, dimension(:), allocatable :: myA, myZ real, dimension(:), allocatable :: work integer, dimension(:), allocatable :: iwork real, dimension(:), allocatable :: eigenvals real, dimension(:), allocatable :: expected integer :: worksize, totwork, iworksize integer, external :: numroc ! blacs routine integer :: me, procs, icontxt, prow, pcol, myrow, mycol ! blacs data integer :: info ! scalapack return value integer, dimension(9) :: ides_a, ides_z ! scalapack array desc integer :: clock real :: calctime, iotime character(len=128) :: filename integer :: mpirank integer :: ierr integer, dimension(2) :: pdims, dims, distribs, dargs integer :: infile integer, dimension(MPI_STATUS_SIZE) :: mpistatus integer :: darray integer :: locsize, nelements integer(kind=MPI_ADDRESS_KIND) :: lb, locextent integer(kind=MPI_OFFSET_KIND) :: disp integer :: nargs integer :: m, nz ! Initialize MPI (for MPI-IO) call MPI_Init(ierr) call MPI_Comm_size(MPI_COMM_WORLD,procs,ierr) call MPI_Comm_rank(MPI_COMM_WORLD,mpirank,ierr) ! May as well get the process grid from MPI_Dims_create pdims = 0 call MPI_Dims_create(procs, 2, pdims, ierr) prow = pdims(1) pcol = pdims(2) ! get filename nargs = command_argument_count() if (nargs /= 1) then print *,'Usage: darray filename' print *,' Where filename = name of binary matrix file.' call MPI_Abort(MPI_COMM_WORLD,1,ierr) endif call get_command_argument(1, filename) ! find the size of the array we'll be using call tick(clock) call MPI_File_open(MPI_COMM_WORLD, filename, MPI_MODE_RDONLY, MPI_INFO_NULL, infile, ierr) call MPI_File_read_all(infile,n,1,MPI_INTEGER,mpistatus,ierr) call MPI_File_close(infile,ierr) ! create the darray that will read in the data. nb = 64 if (nb > (N/prow)) nb = N/prow if (nb > (N/pcol)) nb = N/pcol dims = [n,n] distribs = [MPI_DISTRIBUTE_CYCLIC, MPI_DISTRIBUTE_CYCLIC] dargs = [nb, nb] call MPI_Type_create_darray(procs, mpirank, 2, dims, distribs, dargs, & pdims, MPI_ORDER_FORTRAN, MPI_REAL, darray, ierr) call MPI_Type_commit(darray,ierr) call MPI_Type_size(darray, locsize, ierr) nelements = locsize/4 call MPI_Type_get_extent(darray, lb, locextent, ierr) ! Initialize local arrays allocate(myA(nelements)) allocate(myZ(nelements)) allocate(eigenvals(n), expected(n)) ! read in the data call MPI_File_open(MPI_COMM_WORLD, trim(filename), MPI_MODE_RDONLY, MPI_INFO_NULL, infile, ierr) disp = 4 ! skip N = 4 bytes call MPI_File_set_view(infile, disp, MPI_REAL, darray, "native", MPI_INFO_NULL, ierr) call MPI_File_read_all(infile, myA, nelements, MPI_REAL, mpistatus, ierr) call MPI_File_close(infile,ierr) iotime = tock(clock) if (mpirank == 0) then print *,'I/O time = ', iotime endif ! Initialize blacs processor grid call tick(clock) call blacs_pinfo (me,procs) call blacs_get (-1, 0, icontxt) call blacs_gridinit(icontxt, 'R', prow, pcol) call blacs_gridinfo(icontxt, prow, pcol, myrow, mycol) myArows = numroc(n, nb, myrow, 0, prow) myAcols = numroc(n, nb, mycol, 0, pcol) ! Construct local arrays ! Global structure: matrix A of n rows and n columns ! Prepare array descriptors for ScaLAPACK call descinit( ides_a, n, n, nb, nb, 0, 0, icontxt, myArows, info ) call descinit( ides_z, n, n, nb, nb, 0, 0, icontxt, myArows, info ) ! Call ScaLAPACK library routine allocate(work(1), iwork(1)) iwork(1) = -1 work(1) = -1. call pssyevr( 'V', 'A', 'U', n, myA, 1, 1, ides_a, -1.e20, +1.e20, 1, n, & m, nz, eigenvals, myZ, 1, 1, ides_z, work, -1, & iwork, -1, info ) worksize = int(work(1))/2*3 iworksize = iwork(1)/2*3 print *, 'Local workspace ', worksize call MPI_Reduce(worksize, totwork, 1, MPI_INTEGER, MPI_SUM, 0, MPI_COMM_WORLD, ierr) if (mpirank == 0) print *, ' total work space ', totwork call MPI_Reduce(iworksize, totwork, 1, MPI_INTEGER, MPI_SUM, 0, MPI_COMM_WORLD, ierr) if (mpirank == 0) print *, ' total iwork space ', totwork deallocate(work,iwork) allocate(work(worksize),iwork(iworksize)) call pssyev('N','U',n,myA,1,1,ides_a,eigenvals,myZ,1,1,ides_z,work,worksize,info) if (info /= 0) then print *, 'Error: info = ', info else if (mpirank == 0) then print *, 'Calculated ', m, ' eigenvalues and ', nz, ' eigenvectors.' endif ! Deallocate the local arrays deallocate(myA, myZ) deallocate(work, iwork) ! End blacs for processors that are used call blacs_gridexit(icontxt) calctime = tock(clock) ! calculated the expected eigenvalues for a particular test matrix p = 3. + sqrt((4. * n - 3.) * (n - 1.)*3./(n+1.)) expected(1) = p/(n*(5.-2.*n)) expected(2) = 6./(p*(n + 1.)) expected(3:n) = 1. ! Print results if (me == 0) then if (info /= 0) then print *, 'Error -- info = ', info endif print *,'Eigenvalues L_infty err = ', & maxval(abs(eigenvals-expected)) print *,'Compute time = ', calctime endif deallocate(eigenvals, expected) call MPI_Finalize(ierr) contains subroutine tick(t) integer, intent(OUT) :: t call system_clock(t) end subroutine tick ! returns time in seconds from now to time described by t real function tock(t) integer, intent(in) :: t integer :: now, clock_rate call system_clock(now,clock_rate) tock = real(now - t)/real(clock_rate) end function tock end program darray