diff --git a/SRC/d3DPartition.c b/SRC/d3DPartition.c index 403b3ee5..d15f2c95 100644 --- a/SRC/d3DPartition.c +++ b/SRC/d3DPartition.c @@ -96,6 +96,13 @@ SupernodeToGridMap_t* createSuperGridMap(int_t nsuper,int_t maxLvl, int_t *myTre } void newTrfPartitionInit(int_t nsupers, dLUstruct_t *LUstruct, gridinfo3d_t *grid3d) { + + gridinfo_t* grid = &(grid3d->grid2d); + int iam = grid3d->iam; +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter newTrfPartitionInit()"); +#endif + // check parameters if (LUstruct->trf3Dpart == NULL || grid3d == NULL) { @@ -109,6 +116,13 @@ void newTrfPartitionInit(int_t nsupers, dLUstruct_t *LUstruct, gridinfo3d_t *gr // Conversion of supernodal etree to list treeList_t *treeList = setree2list(nsupers, setree); +// YL: The essential difference between this function and dinitTrf3Dpartition_allgrid to avoid calling pddistribute* twice is that Piyush has removed the treelist weight update function below (and iperm_c_supno as well), which requires the LU data structure +#if 0 + /*update treelist with weight and depth*/ + getSCUweight_allgrid(nsupers, treeList, xsup, + LUstruct->Llu->Lrowind_bc_ptr, LUstruct->Llu->Ufstnz_br_ptr, + grid3d); +#endif // Calculation of tree weight calcTreeWeight(nsupers, setree, treeList, LUstruct->Glu_persist->xsup); @@ -130,10 +144,26 @@ void newTrfPartitionInit(int_t nsupers, dLUstruct_t *LUstruct, gridinfo3d_t *gr // sForests, LUstruct, grid3d); int_t *myNodeCount = getMyNodeCountsFr(maxLvl, myTreeIdxs, sForests); int_t **treePerm = getTreePermFr(myTreeIdxs, sForests, grid3d); + int* supernodeMask = SUPERLU_MALLOC(nsupers*sizeof(int)); + for (int ii = 0; ii < nsupers; ++ii) + supernodeMask[ii]=0; + for (int lvl = 0; lvl < maxLvl; ++lvl) + { + // printf("iam %5d lvl %5d myNodeCount[lvl] %5d\n",grid3d->iam, lvl,myNodeCount[lvl]); + for (int nd = 0; nd < myNodeCount[lvl]; ++nd) + { + supernodeMask[treePerm[lvl][nd]]=1; + } + } + + + + // dLUValSubBuf_t *LUvsb = SUPERLU_MALLOC(sizeof(dLUValSubBuf_t)); // 
dLluBufInit(LUvsb, LUstruct); - + +#if (DEBUGlevel>=1) // let count sum of gnodecount int_t gNodeCountSum = 0; for (int_t i = 0; i < (1 << maxLvl) - 1; ++i) @@ -141,6 +171,80 @@ void newTrfPartitionInit(int_t nsupers, dLUstruct_t *LUstruct, gridinfo3d_t *gr gNodeCountSum += gNodeCount[i]; } printf(" Iam: %d, Nsupers %d, gnodecountSum =%d \n", grid3d->iam, nsupers, gNodeCountSum); +#endif + + /* Sherry 2/17/23 + Compute buffer sizes needed for diagonal LU blocks and C matrices in GEMM. */ + + + iam = grid->iam; /* 'grid' is 2D grid */ + int k, k0, k_st, k_end, offset, nsupc, krow, kcol; + int myrow = MYROW (iam, grid); + int mycol = MYCOL (iam, grid); + int_t *xsup = LUstruct->Glu_persist->xsup; + +#if 0 + int krow = PROW (k, grid); + int kcol = PCOL (k, grid); + int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + + int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + double** Unzval_br_ptr = Llu->Unzval_br_ptr; +#endif + + int mxLeafNode = 0; // Yang: only need to check the leaf level of topoInfo as the factorization proceeds level by level + for (int ilvl = 0; ilvl < maxLvl; ++ilvl) { + if (sForests[myTreeIdxs[ilvl]] && sForests[myTreeIdxs[ilvl]]->topoInfo.eTreeTopLims[1] > mxLeafNode ) + mxLeafNode = sForests[myTreeIdxs[ilvl]]->topoInfo.eTreeTopLims[1]; + } + + // Yang: use ldts to track the maximum needed buffer sizes per node of topoInfo + //int *ldts = (int*) SUPERLU_MALLOC(mxLeafNode*sizeof(int)); + //for (int i = 0; i < mxLeafNode; ++i) { //???????? 
+ //ldts[i]=1; + //} + int *ldts = int32Calloc_dist(mxLeafNode); + + for (int ilvl = 0; ilvl < maxLvl; ++ilvl) { /* Loop through the Pz tree levels */ + int treeId = myTreeIdxs[ilvl]; + sForest_t* sforest = sForests[treeId]; + if (sforest){ + int_t *perm_node = sforest->nodeList ; /* permuted list, in order of factorization */ + int maxTopoLevel = sforest->topoInfo.numLvl;/* number of levels at each outer-tree node */ + for (int topoLvl = 0; topoLvl < maxTopoLevel; ++topoLvl) + { + /* code */ + k_st = sforest->topoInfo.eTreeTopLims[topoLvl]; + k_end = sforest->topoInfo.eTreeTopLims[topoLvl + 1]; + //printf("\t..topoLvl %d, k_st %d, k_end %d\n", topoLvl, k_st, k_end); + + for (int k0 = k_st; k0 < k_end; ++k0) + { + offset = k0 - k_st; + k = perm_node[k0]; + nsupc = (xsup[k+1]-xsup[k]); + krow = PROW (k, grid); + kcol = PCOL (k, grid); + if ( myrow == krow || mycol == kcol ) /* diagonal process */ + { + ldts[offset] = SUPERLU_MAX(ldts[offset], nsupc); + } +#if 0 /* GPU gemm buffers can only be set on GPU side, because here we only know + the size of U data structure on CPU. 
It is different on GPU */ + if ( mycol == kcol ) { /* processes owning L panel */ + + } + if ( myrow == krow ) + gemmCsizes[offset] = SUPERLU_MAX(ldts[offset], ???); +#endif + } + } + } + } + + + trf3Dpart->gEtreeInfo = fillEtreeInfo(nsupers, setree, treeList); // trf3Dpart->iperm_c_supno = iperm_c_supno; @@ -149,11 +253,22 @@ void newTrfPartitionInit(int_t nsupers, dLUstruct_t *LUstruct, gridinfo3d_t *gr trf3Dpart->myZeroTrIdxs = myZeroTrIdxs; trf3Dpart->sForests = sForests; trf3Dpart->treePerm = treePerm; + trf3Dpart->maxLvl = maxLvl; // trf3Dpart->LUvsb = LUvsb; trf3Dpart->supernode2treeMap = createSupernode2TreeMap(nsupers, maxLvl, gNodeCount, gNodeLists); trf3Dpart->superGridMap = createSuperGridMap(nsupers, maxLvl, myTreeIdxs, myZeroTrIdxs, gNodeCount, gNodeLists); - - + trf3Dpart->supernodeMask = supernodeMask; + trf3Dpart->mxLeafNode = mxLeafNode; // Sherry added these 3 + trf3Dpart->diagDims = ldts; + //trf3Dpart->gemmCsizes = gemmCsizes; + // Sherry added + // Deallocate storage + SUPERLU_FREE(gNodeCount); + SUPERLU_FREE(gNodeLists); + free_treelist(nsupers, treeList); +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit newTrfPartitionInit()"); +#endif } @@ -326,18 +441,28 @@ void bcastPermutedSparseA(SuperMatrix *A, // beyond the last row, so that rowptr[n_loc] = nnz_loc.*/ // } NRformat_loc; - + // NRformat_loc *Astore = (NRformat_loc *) A->Store; MPI_Bcast(&(Astore->nnz_loc), sizeof(int_t), MPI_BYTE, 0, grid3d->zscp.comm); MPI_Bcast(&(Astore->m_loc), sizeof(int_t), MPI_BYTE, 0, grid3d->zscp.comm); MPI_Bcast(&(Astore->fst_row), sizeof(int_t), MPI_BYTE, 0, grid3d->zscp.comm); + + +/***** YL: remove the allocation in the following as dGatherNRformat_loc3d_allgrid instead of dGatherNRformat_loc3d has been called, which already allocate A->Store on all grids + * Note the the broadcast is still needed as the A->Store has been scaled by scaleMatrixDiagonally only on grid 0 +*/ +#if 1 + MPI_Bcast(Astore->nzval, Astore->nnz_loc*sizeof(double), MPI_BYTE, 0, 
grid3d->zscp.comm); + MPI_Bcast(Astore->rowptr, (Astore->m_loc+1)*sizeof(int_t), MPI_BYTE, 0, grid3d->zscp.comm); + MPI_Bcast(Astore->colind, Astore->nnz_loc*sizeof(int_t), MPI_BYTE, 0, grid3d->zscp.comm); +#else allocBcastArray( &(Astore->nzval), Astore->nnz_loc*sizeof(double), 0, grid3d->zscp.comm); allocBcastArray( &(Astore->rowptr), (Astore->m_loc+1)*sizeof(int_t), 0, grid3d->zscp.comm); allocBcastArray( &(Astore->colind), Astore->nnz_loc*sizeof(int_t), 0, grid3d->zscp.comm); - +#endif } diff --git a/SRC/pddistribute-aux3d.c b/SRC/pddistribute-aux3d.c index 9e868960..04ab6f0f 100644 --- a/SRC/pddistribute-aux3d.c +++ b/SRC/pddistribute-aux3d.c @@ -504,6 +504,8 @@ int_t checkDist3DLUStruct( dLUstruct_t* LUstruct, gridinfo3d_t* grid3d) } } } - printf("Check 3D LU structure passed\n"); +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (grid3d->iam, "Exit checkDist3DLUStruct()"); +#endif return 0; } \ No newline at end of file diff --git a/SRC/pddistribute3d.c b/SRC/pddistribute3d.c index e1aae876..968d7d8b 100644 --- a/SRC/pddistribute3d.c +++ b/SRC/pddistribute3d.c @@ -439,41 +439,19 @@ float pddistribute3d(superlu_dist_options_t *options, int_t n, SuperMatrix *A, int *index1; /* temporary pointer to array of int */ double *lusup, *lusup_srt, *uval; /* nonzero values in L and U */ double **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ - double *Lnzval_bc_dat; /* size: sum of sizes of Lnzval_bc_ptr[lk]) */ - long int *Lnzval_bc_offset; /* size ceil(NSUPERS/Pc) */ int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ - int_t *Lrowind_bc_dat; /* size: sum of sizes of Lrowind_bc_ptr[lk]) */ - long int *Lrowind_bc_offset; /* size ceil(NSUPERS/Pc) */ int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) */ - int_t *Lindval_loc_bc_dat; /* size: sum of sizes of Lindval_loc_bc_ptr[lk]) */ - long int *Lindval_loc_bc_offset; /* size ceil(NSUPERS/Pc) */ int_t *Unnz; /* size ceil(NSUPERS/Pc) */ double **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ - double *Unzval_br_dat; /* size: sum of sizes 
of Unzval_br_ptr[lk]) */ - long int *Unzval_br_offset; /* size ceil(NSUPERS/Pr) */ - long int Unzval_br_cnt = 0; int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ - int_t *Ufstnz_br_dat; /* size: sum of sizes of Ufstnz_br_ptr[lk]) */ - long int *Ufstnz_br_offset; /* size ceil(NSUPERS/Pr) */ - long int Ufstnz_br_cnt = 0; - - C_Tree *LBtree_ptr = NULL; /* size ceil(NSUPERS/Pc) */ - C_Tree *LRtree_ptr = NULL; /* size ceil(NSUPERS/Pr) */ - C_Tree *UBtree_ptr = NULL; /* size ceil(NSUPERS/Pc) */ - C_Tree *URtree_ptr = NULL; /* size ceil(NSUPERS/Pr) */ + int msgsize; int_t *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */ Ucb_indptr_t **Ucb_indptr; /* Vertical linked list pointing to Uindex[] */ - Ucb_indptr_t *Ucb_inddat; - long int *Ucb_indoffset; - long int Ucb_indcnt = 0; int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ - int_t *Ucb_valdat; - long int *Ucb_valoffset; - long int Ucb_valcnt = 0; /*-- Counts to be used in factorization. --*/ int *ToRecv, *ToSendD, **ToSendR; @@ -522,11 +500,7 @@ float pddistribute3d(superlu_dist_options_t *options, int_t n, SuperMatrix *A, int *frecv, *brecv; int_t *lloc; double **Linv_bc_ptr; /* size ceil(NSUPERS/Pc) */ - double *Linv_bc_dat; /* size: sum of sizes of Linv_bc_ptr[lk]) */ - long int *Linv_bc_offset; /* size ceil(NSUPERS/Pc) */ double **Uinv_bc_ptr; /* size ceil(NSUPERS/Pc) */ - double *Uinv_bc_dat; /* size: sum of sizes of Uinv_bc_ptr[lk]) */ - long int *Uinv_bc_offset; /* size ceil(NSUPERS/Pc) */ double *SeedSTD_BC, *SeedSTD_RD; int_t idx_indx, idx_lusup; int_t nbrow; @@ -623,20 +597,8 @@ float pddistribute3d(superlu_dist_options_t *options, int_t n, SuperMatrix *A, if (!(Unzval_br_ptr = (double **)SUPERLU_MALLOC(k * sizeof(double *)))) ABORT("Malloc fails for Unzval_br_ptr[]."); - if (!(Unzval_br_offset = - (long int *)SUPERLU_MALLOC(k * sizeof(long int)))) - { - fprintf(stderr, "Malloc fails for Unzval_br_offset[]."); - } - Unzval_br_offset[k - 1] = -1; if (!(Ufstnz_br_ptr = 
(int_t **)SUPERLU_MALLOC(k * sizeof(int_t *)))) ABORT("Malloc fails for Ufstnz_br_ptr[]."); - if (!(Ufstnz_br_offset = - (long int *)SUPERLU_MALLOC(k * sizeof(long int)))) - { - fprintf(stderr, "Malloc fails for Ufstnz_br_offset[]."); - } - Ufstnz_br_offset[k - 1] = -1; if (!(ToSendD = SUPERLU_MALLOC(k * sizeof(int)))) ABORT("Malloc fails for ToSendD[]."); @@ -765,12 +727,8 @@ float pddistribute3d(superlu_dist_options_t *options, int_t n, SuperMatrix *A, if (!(index = intMalloc_dist(len1 + 1))) ABORT("Malloc fails for Uindex[]."); Ufstnz_br_ptr[lb] = index; - Ufstnz_br_offset[lb] = len1 + 1; - Ufstnz_br_cnt += Ufstnz_br_offset[lb]; if (!(Unzval_br_ptr[lb] = doubleMalloc_dist(len))) ABORT("Malloc fails for Unzval_br_ptr[*][]."); - Unzval_br_offset[lb] = len; - Unzval_br_cnt += Unzval_br_offset[lb]; mybufmax[2] = SUPERLU_MAX(mybufmax[2], len1); mybufmax[3] = SUPERLU_MAX(mybufmax[3], len); @@ -783,8 +741,6 @@ float pddistribute3d(superlu_dist_options_t *options, int_t n, SuperMatrix *A, { Ufstnz_br_ptr[lb] = NULL; Unzval_br_ptr[lb] = NULL; - Unzval_br_offset[lb] = -1; - Ufstnz_br_offset[lb] = -1; } Urb_length[lb] = 0; /* Reset block length. */ Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. 
*/ @@ -835,54 +791,24 @@ float pddistribute3d(superlu_dist_options_t *options, int_t n, SuperMatrix *A, if (!(Lrowind_bc_ptr = (int_t **)SUPERLU_MALLOC(k * sizeof(int_t *)))) ABORT("Malloc fails for Lrowind_bc_ptr[]."); Lrowind_bc_ptr[k - 1] = NULL; - if (!(Lrowind_bc_offset = - (long int *)SUPERLU_MALLOC(k * sizeof(long int)))) - { - fprintf(stderr, "Malloc fails for Lrowind_bc_offset[]."); - } - Lrowind_bc_offset[k - 1] = -1; - if (!(Lnzval_bc_offset = - (long int *)SUPERLU_MALLOC(k * sizeof(long int)))) - { - fprintf(stderr, "Malloc fails for Lnzval_bc_offset[]."); - } - Lnzval_bc_offset[k - 1] = -1; if (!(Lindval_loc_bc_ptr = (int_t **)SUPERLU_MALLOC(k * sizeof(int_t *)))) ABORT("Malloc fails for Lindval_loc_bc_ptr[]."); Lindval_loc_bc_ptr[k - 1] = NULL; - if (!(Lindval_loc_bc_offset = - (long int *)SUPERLU_MALLOC(k * sizeof(long int)))) - { - fprintf(stderr, "Malloc fails for Lindval_loc_bc_offset[]."); - } - Lindval_loc_bc_offset[k - 1] = -1; if (!(Linv_bc_ptr = (double **)SUPERLU_MALLOC(k * sizeof(double *)))) { fprintf(stderr, "Malloc fails for Linv_bc_ptr[]."); } - if (!(Linv_bc_offset = - (long int *)SUPERLU_MALLOC(k * sizeof(long int)))) - { - fprintf(stderr, "Malloc fails for Linv_bc_offset[]."); - } if (!(Uinv_bc_ptr = (double **)SUPERLU_MALLOC(k * sizeof(double *)))) { fprintf(stderr, "Malloc fails for Uinv_bc_ptr[]."); } - if (!(Uinv_bc_offset = - (long int *)SUPERLU_MALLOC(k * sizeof(long int)))) - { - fprintf(stderr, "Malloc fails for Uinv_bc_offset[]."); - } Linv_bc_ptr[k - 1] = NULL; Uinv_bc_ptr[k - 1] = NULL; - Linv_bc_offset[k - 1] = -1; - Uinv_bc_offset[k - 1] = -1; if (!(Unnz = (int_t *)SUPERLU_MALLOC(k * sizeof(int_t)))) @@ -914,11 +840,6 @@ float pddistribute3d(superlu_dist_options_t *options, int_t n, SuperMatrix *A, PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS. THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U. 
------------------------------------------------------------*/ - long int Linv_bc_cnt = 0; - long int Uinv_bc_cnt = 0; - long int Lrowind_bc_cnt = 0; - long int Lnzval_bc_cnt = 0; - long int Lindval_loc_bc_cnt = 0; for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */ @@ -1091,17 +1012,11 @@ float pddistribute3d(superlu_dist_options_t *options, int_t n, SuperMatrix *A, len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; if (!(index = intMalloc_dist(len1))) ABORT("Malloc fails for index[]"); - Lrowind_bc_offset[ljb] = len1; - Lrowind_bc_cnt += Lrowind_bc_offset[ljb]; if (!(lusup = (double *)SUPERLU_MALLOC(len * nsupc * sizeof(double)))) ABORT("Malloc fails for lusup[]"); - Lnzval_bc_offset[ljb] = len * nsupc; - Lnzval_bc_cnt += Lnzval_bc_offset[ljb]; if (!(Lindval_loc_bc_ptr[ljb] = intCalloc_dist(nrbl * 3))) ABORT("Malloc fails for Lindval_loc_bc_ptr[ljb][]"); - Lindval_loc_bc_offset[ljb] = nrbl * 3; - Lindval_loc_bc_cnt += Lindval_loc_bc_offset[ljb]; myrow = MYROW(iam, grid); krow = PROW(jb, grid); @@ -1109,20 +1024,14 @@ float pddistribute3d(superlu_dist_options_t *options, int_t n, SuperMatrix *A, { /* diagonal block */ if (!(Linv_bc_ptr[ljb] = (double *)SUPERLU_MALLOC(nsupc * nsupc * sizeof(double)))) ABORT("Malloc fails for Linv_bc_ptr[ljb][]"); - Linv_bc_offset[ljb] = nsupc * nsupc; - Linv_bc_cnt += Linv_bc_offset[ljb]; if (!(Uinv_bc_ptr[ljb] = (double *)SUPERLU_MALLOC(nsupc * nsupc * sizeof(double)))) ABORT("Malloc fails for Uinv_bc_ptr[ljb][]"); - Uinv_bc_offset[ljb] = nsupc * nsupc; - Uinv_bc_cnt += Uinv_bc_offset[ljb]; } else { Linv_bc_ptr[ljb] = NULL; - Linv_bc_offset[ljb] = -1; Uinv_bc_ptr[ljb] = NULL; - Uinv_bc_offset[ljb] = -1; } mybufmax[0] = SUPERLU_MAX(mybufmax[0], len1); @@ -1241,12 +1150,7 @@ float pddistribute3d(superlu_dist_options_t *options, int_t n, SuperMatrix *A, Lrowind_bc_ptr[ljb] = NULL; Lnzval_bc_ptr[ljb] = NULL; Linv_bc_ptr[ljb] = NULL; - Linv_bc_offset[ljb] = -1; - Lrowind_bc_offset[ljb] = -1; - Lindval_loc_bc_offset[ljb] = 
-1; - Lnzval_bc_offset[ljb] = -1; Uinv_bc_ptr[ljb] = NULL; - Uinv_bc_offset[ljb] = -1; Lindval_loc_bc_ptr[ljb] = NULL; } /* if nrbl ... */ @@ -1267,126 +1171,12 @@ float pddistribute3d(superlu_dist_options_t *options, int_t n, SuperMatrix *A, Lrowind_bc_ptr[ljb] = NULL; Lnzval_bc_ptr[ljb] = NULL; Linv_bc_ptr[ljb] = NULL; - Linv_bc_offset[ljb] = -1; - Lrowind_bc_offset[ljb] = -1; - Lindval_loc_bc_offset[ljb] = -1; - Lnzval_bc_offset[ljb] = -1; Uinv_bc_ptr[ljb] = NULL; - Uinv_bc_offset[ljb] = -1; Lindval_loc_bc_ptr[ljb] = NULL; } } } /* for jb ... */ - Linv_bc_cnt += 1; // safe guard - Uinv_bc_cnt += 1; - Lrowind_bc_cnt += 1; - Lindval_loc_bc_cnt += 1; - Lnzval_bc_cnt += 1; - if (!(Linv_bc_dat = - (double *)SUPERLU_MALLOC(Linv_bc_cnt * sizeof(double)))) - { - fprintf(stderr, "Malloc fails for Linv_bc_dat[]."); - } - if (!(Uinv_bc_dat = - (double *)SUPERLU_MALLOC(Uinv_bc_cnt * sizeof(double)))) - { - fprintf(stderr, "Malloc fails for Uinv_bc_dat[]."); - } - - if (!(Lrowind_bc_dat = - (int_t *)SUPERLU_MALLOC(Lrowind_bc_cnt * sizeof(int_t)))) - { - fprintf(stderr, "Malloc fails for Lrowind_bc_dat[]."); - } - if (!(Lindval_loc_bc_dat = - (int_t *)SUPERLU_MALLOC(Lindval_loc_bc_cnt * sizeof(int_t)))) - { - fprintf(stderr, "Malloc fails for Lindval_loc_bc_dat[]."); - } - if (!(Lnzval_bc_dat = - (double *)SUPERLU_MALLOC(Lnzval_bc_cnt * sizeof(double)))) - { - fprintf(stderr, "Malloc fails for Lnzval_bc_dat[]."); - } - - /* use contingous memory for Linv_bc_ptr, Uinv_bc_ptr, Lrowind_bc_ptr, Lnzval_bc_ptr*/ - k = CEILING(nsupers, grid->npcol); /* Number of local block columns */ - Linv_bc_cnt = 0; - Uinv_bc_cnt = 0; - Lrowind_bc_cnt = 0; - Lnzval_bc_cnt = 0; - Lindval_loc_bc_cnt = 0; - long int tmp_cnt; - for (jb = 0; jb < k; ++jb) - { /* for each block column ... 
*/ - if (Linv_bc_ptr[jb] != NULL) - { - for (jj = 0; jj < Linv_bc_offset[jb]; ++jj) - { - Linv_bc_dat[Linv_bc_cnt + jj] = Linv_bc_ptr[jb][jj]; - } - SUPERLU_FREE(Linv_bc_ptr[jb]); - Linv_bc_ptr[jb] = &Linv_bc_dat[Linv_bc_cnt]; - tmp_cnt = Linv_bc_offset[jb]; - Linv_bc_offset[jb] = Linv_bc_cnt; - Linv_bc_cnt += tmp_cnt; - } - - if (Uinv_bc_ptr[jb] != NULL) - { - for (jj = 0; jj < Uinv_bc_offset[jb]; ++jj) - { - Uinv_bc_dat[Uinv_bc_cnt + jj] = Uinv_bc_ptr[jb][jj]; - } - SUPERLU_FREE(Uinv_bc_ptr[jb]); - Uinv_bc_ptr[jb] = &Uinv_bc_dat[Uinv_bc_cnt]; - tmp_cnt = Uinv_bc_offset[jb]; - Uinv_bc_offset[jb] = Uinv_bc_cnt; - Uinv_bc_cnt += tmp_cnt; - } - - if (Lrowind_bc_ptr[jb] != NULL) - { - for (jj = 0; jj < Lrowind_bc_offset[jb]; ++jj) - { - Lrowind_bc_dat[Lrowind_bc_cnt + jj] = Lrowind_bc_ptr[jb][jj]; - } - SUPERLU_FREE(Lrowind_bc_ptr[jb]); - Lrowind_bc_ptr[jb] = &Lrowind_bc_dat[Lrowind_bc_cnt]; - tmp_cnt = Lrowind_bc_offset[jb]; - Lrowind_bc_offset[jb] = Lrowind_bc_cnt; - Lrowind_bc_cnt += tmp_cnt; - } - - if (Lnzval_bc_ptr[jb] != NULL) - { - for (jj = 0; jj < Lnzval_bc_offset[jb]; ++jj) - { - Lnzval_bc_dat[Lnzval_bc_cnt + jj] = Lnzval_bc_ptr[jb][jj]; - } - SUPERLU_FREE(Lnzval_bc_ptr[jb]); - Lnzval_bc_ptr[jb] = &Lnzval_bc_dat[Lnzval_bc_cnt]; - tmp_cnt = Lnzval_bc_offset[jb]; - Lnzval_bc_offset[jb] = Lnzval_bc_cnt; - Lnzval_bc_cnt += tmp_cnt; - } - - if (Lindval_loc_bc_ptr[jb] != NULL) - { - for (jj = 0; jj < Lindval_loc_bc_offset[jb]; ++jj) - { - Lindval_loc_bc_dat[Lindval_loc_bc_cnt + jj] = Lindval_loc_bc_ptr[jb][jj]; - } - SUPERLU_FREE(Lindval_loc_bc_ptr[jb]); - Lindval_loc_bc_ptr[jb] = &Lindval_loc_bc_dat[Lindval_loc_bc_cnt]; - tmp_cnt = Lindval_loc_bc_offset[jb]; - Lindval_loc_bc_offset[jb] = Lindval_loc_bc_cnt; - Lindval_loc_bc_cnt += tmp_cnt; - } - } /* for jb ... */ - - ///////////////////////////////////////////////////////////////// /* Set up additional pointers for the index and value arrays of U. nub is the number of local block columns. 
*/ @@ -1399,18 +1189,6 @@ float pddistribute3d(superlu_dist_options_t *options, int_t n, SuperMatrix *A, ABORT("Malloc fails for Ucb_indptr[]"); if (!(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *)))) ABORT("Malloc fails for Ucb_valptr[]"); - if (!(Ucb_valoffset = - (long int *)SUPERLU_MALLOC(nub * sizeof(long int)))) - { - fprintf(stderr, "Malloc fails for Ucb_valoffset[]."); - } - Ucb_valoffset[nub - 1] = -1; - if (!(Ucb_indoffset = - (long int *)SUPERLU_MALLOC(nub * sizeof(long int)))) - { - fprintf(stderr, "Malloc fails for Ucb_indoffset[]."); - } - Ucb_indoffset[nub - 1] = -1; nlb = CEILING(nsupers, grid->nprow); /* Number of local block rows. */ @@ -1440,19 +1218,13 @@ float pddistribute3d(superlu_dist_options_t *options, int_t n, SuperMatrix *A, { /* Not an empty block column. */ if (!(Ucb_indptr[lb] = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t)))) ABORT("Malloc fails for Ucb_indptr[lb][]"); - Ucb_indoffset[lb] = Urbs[lb]; - Ucb_indcnt += Ucb_indoffset[lb]; if (!(Ucb_valptr[lb] = (int_t *)intMalloc_dist(Urbs[lb]))) ABORT("Malloc fails for Ucb_valptr[lb][]"); - Ucb_valoffset[lb] = Urbs[lb]; - Ucb_valcnt += Ucb_valoffset[lb]; } else { Ucb_valptr[lb] = NULL; - Ucb_valoffset[lb] = -1; Ucb_indptr[lb] = NULL; - Ucb_indoffset[lb] = -1; } } for (lk = 0; lk < nlb; ++lk) @@ -1503,905 +1275,1196 @@ float pddistribute3d(superlu_dist_options_t *options, int_t n, SuperMatrix *A, } } - Unzval_br_cnt += 1; // safe guard - Ufstnz_br_cnt += 1; - Ucb_valcnt += 1; - Ucb_indcnt += 1; - if (!(Unzval_br_dat = - (double *)SUPERLU_MALLOC(Unzval_br_cnt * sizeof(double)))) - { - fprintf(stderr, "Malloc fails for Lnzval_bc_dat[]."); - } - if (!(Ufstnz_br_dat = - (int_t *)SUPERLU_MALLOC(Ufstnz_br_cnt * sizeof(int_t)))) - { - fprintf(stderr, "Malloc fails for Ufstnz_br_dat[]."); - } - if (!(Ucb_valdat = - (int_t *)SUPERLU_MALLOC(Ucb_valcnt * sizeof(int_t)))) - { - fprintf(stderr, "Malloc fails for Ucb_valdat[]."); - } - if (!(Ucb_inddat = - (Ucb_indptr_t *)SUPERLU_MALLOC(Ucb_indcnt 
* sizeof(Ucb_indptr_t)))) - { - fprintf(stderr, "Malloc fails for Ucb_inddat[]."); - } - - /* use contingous memory for Unzval_br_ptr, Ufstnz_br_ptr, Ucb_valptr */ - k = CEILING(nsupers, grid->nprow); /* Number of local block rows */ - Unzval_br_cnt = 0; - Ufstnz_br_cnt = 0; - for (lb = 0; lb < k; ++lb) - { /* for each block row ... */ - if (Unzval_br_ptr[lb] != NULL) - { - for (jj = 0; jj < Unzval_br_offset[lb]; ++jj) - { - Unzval_br_dat[Unzval_br_cnt + jj] = Unzval_br_ptr[lb][jj]; - } - SUPERLU_FREE(Unzval_br_ptr[lb]); - Unzval_br_ptr[lb] = &Unzval_br_dat[Unzval_br_cnt]; - tmp_cnt = Unzval_br_offset[lb]; - Unzval_br_offset[lb] = Unzval_br_cnt; - Unzval_br_cnt += tmp_cnt; - } - - if (Ufstnz_br_ptr[lb] != NULL) - { - for (jj = 0; jj < Ufstnz_br_offset[lb]; ++jj) - { - Ufstnz_br_dat[Ufstnz_br_cnt + jj] = Ufstnz_br_ptr[lb][jj]; - } - SUPERLU_FREE(Ufstnz_br_ptr[lb]); - Ufstnz_br_ptr[lb] = &Ufstnz_br_dat[Ufstnz_br_cnt]; - tmp_cnt = Ufstnz_br_offset[lb]; - Ufstnz_br_offset[lb] = Ufstnz_br_cnt; - Ufstnz_br_cnt += tmp_cnt; - } - } - - k = CEILING(nsupers, grid->npcol); /* Number of local block columns */ - Ucb_valcnt = 0; - Ucb_indcnt = 0; - for (lb = 0; lb < k; ++lb) - { /* for each block row ... */ - if (Ucb_valptr[lb] != NULL) - { - for (jj = 0; jj < Ucb_valoffset[lb]; ++jj) - { - Ucb_valdat[Ucb_valcnt + jj] = Ucb_valptr[lb][jj]; - } - SUPERLU_FREE(Ucb_valptr[lb]); - Ucb_valptr[lb] = &Ucb_valdat[Ucb_valcnt]; - tmp_cnt = Ucb_valoffset[lb]; - Ucb_valoffset[lb] = Ucb_valcnt; - Ucb_valcnt += tmp_cnt; - } - if (Ucb_indptr[lb] != NULL) - { - for (jj = 0; jj < Ucb_indoffset[lb]; ++jj) - { - Ucb_inddat[Ucb_indcnt + jj] = Ucb_indptr[lb][jj]; - } - SUPERLU_FREE(Ucb_indptr[lb]); - Ucb_indptr[lb] = &Ucb_inddat[Ucb_indcnt]; - tmp_cnt = Ucb_indoffset[lb]; - Ucb_indoffset[lb] = Ucb_indcnt; - Ucb_indcnt += tmp_cnt; - } - } /* for lb ... 
*/ - - ///////////////////////////////////////////////////////////////// - #if (PROFlevel >= 1) t = SuperLU_timer_(); #endif - if (!grid3d->zscp.Iam) - { /* construct the Bcast tree for L ... */ - - k = CEILING(nsupers, grid->npcol); /* Number of local block columns */ - if (!(LBtree_ptr = (C_Tree *)SUPERLU_MALLOC(k * sizeof(C_Tree)))) - ABORT("Malloc fails for LBtree_ptr[]."); - if (!(ActiveFlag = intCalloc_dist(grid->nprow * 2))) - ABORT("Calloc fails for ActiveFlag[]."); - if (!(ranks = (int *)SUPERLU_MALLOC(grid->nprow * sizeof(int)))) - ABORT("Malloc fails for ranks[]."); - if (!(SeedSTD_BC = (double *)SUPERLU_MALLOC(k * sizeof(double)))) - ABORT("Malloc fails for SeedSTD_BC[]."); - - for (i = 0; i < k; i++) - { - SeedSTD_BC[i] = rand(); - } - - MPI_Allreduce(MPI_IN_PLACE, &SeedSTD_BC[0], k, MPI_DOUBLE, MPI_MAX, grid->cscp.comm); - - for (ljb = 0; ljb < k; ++ljb) - { - C_BcTree_Nullify(&LBtree_ptr[ljb]); - } - - if (!(ActiveFlagAll = intMalloc_dist(grid->nprow * k))) - ABORT("Calloc fails for ActiveFlag[]."); - memTRS += k * sizeof(C_Tree) + k * dword + grid->nprow * k * iword; // acount for LBtree_ptr, SeedSTD_BC, ActiveFlagAll - for (j = 0; j < grid->nprow * k; ++j) - ActiveFlagAll[j] = 3 * nsupers; - for (ljb = 0; ljb < k; ++ljb) - { /* for each local block column ... */ - jb = mycol + ljb * grid->npcol; /* not sure */ -#if 1 - // if (superGridMap[jb] != NOT_IN_GRID || !grid3d->zscp.Iam ) -#endif - { - if (jb < nsupers) - { - pc = PCOL(jb, grid); - fsupc = FstBlockC(jb); - nsupc = SuperSize(jb); - istart = xlsub[fsupc]; - for (i = istart; i < xlsub[fsupc + 1]; ++i) - { - irow = lsub[i]; - gb = BlockNum(irow); - pr = PROW(gb, grid); - ActiveFlagAll[pr + ljb * grid->nprow] = SUPERLU_MIN(ActiveFlagAll[pr + ljb * grid->nprow], gb); - } /* for j ... */ - } - } - } - for (ljb = 0; ljb < k; ++ljb) - { /* for each local block column ... 
*/ + Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; + Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr; + Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; + Llu->Ufstnz_br_ptr = Ufstnz_br_ptr; + Llu->Unzval_br_ptr = Unzval_br_ptr; + Llu->Unnz = Unnz; + Llu->ToRecv = ToRecv; + Llu->ToSendD = ToSendD; + Llu->ToSendR = ToSendR; + Llu->fmod = fmod; + Llu->fsendx_plist = fsendx_plist; + Llu->nfrecvx = nfrecvx; + Llu->nfsendx = nfsendx; + Llu->bmod = bmod; + Llu->bsendx_plist = bsendx_plist; + Llu->nbrecvx = nbrecvx; + Llu->nbsendx = nbsendx; + Llu->ilsum = ilsum; + Llu->ldalsum = ldaspa; + Llu->Linv_bc_ptr = Linv_bc_ptr; + Llu->Uinv_bc_ptr = Uinv_bc_ptr; + Llu->Urbs = Urbs; + Llu->Ucb_indptr = Ucb_indptr; + Llu->Ucb_valptr = Ucb_valptr; - jb = mycol + ljb * grid->npcol; /* not sure */ -#if 1 - // if (superGridMap[jb] != NOT_IN_GRID || !grid3d->zscp.Iam ) +#if (PRNTlevel >= 1) + if (!iam) + printf(".. # L blocks " IFMT "\t# U blocks " IFMT "\n", + nLblocks, nUblocks); #endif - { - if (jb < nsupers) - { - pc = PCOL(jb, grid); - - for (j = 0; j < grid->nprow; ++j) - ActiveFlag[j] = ActiveFlagAll[j + ljb * grid->nprow]; - for (j = 0; j < grid->nprow; ++j) - ActiveFlag[j + grid->nprow] = j; - for (j = 0; j < grid->nprow; ++j) - ranks[j] = -1; - - Root = -1; - Iactive = 0; - for (j = 0; j < grid->nprow; ++j) - { - if (ActiveFlag[j] != 3 * nsupers) - { - gb = ActiveFlag[j]; - pr = PROW(gb, grid); - if (gb == jb) - Root = pr; - if (myrow == pr) - Iactive = 1; - } - } - - quickSortM(ActiveFlag, 0, grid->nprow - 1, grid->nprow, 0, 2); - - if (Iactive == 1) - { - - assert(Root > -1); - rank_cnt = 1; - ranks[0] = Root; - for (j = 0; j < grid->nprow; ++j) - { - if (ActiveFlag[j] != 3 * nsupers && ActiveFlag[j + grid->nprow] != Root) - { - ranks[rank_cnt] = ActiveFlag[j + grid->nprow]; - ++rank_cnt; - } - } - - if (rank_cnt > 1) - { - - for (ii = 0; ii < rank_cnt; ii++) // use global ranks rather than local ranks - ranks[ii] = PNUM(ranks[ii], pc, grid); - msgsize = SuperSize(jb); - - 
C_BcTree_Create(&LBtree_ptr[ljb], grid->comm, ranks, rank_cnt, msgsize, 'd'); - LBtree_ptr[ljb].tag_ = BC_L; - - if (Root == myrow) - { - rank_cnt_ref = 1; - for (j = 0; j < grid->nprow; ++j) - { - if (fsendx_plist[ljb][j] != SLU_EMPTY) - { - ++rank_cnt_ref; - } - } - assert(rank_cnt == rank_cnt_ref); - } - // #endif - } - } - } - } - } + SUPERLU_FREE(rb_marker); + SUPERLU_FREE(Urb_fstnz); + SUPERLU_FREE(Urb_length); + SUPERLU_FREE(Urb_indptr); + SUPERLU_FREE(Lrb_length); + SUPERLU_FREE(Lrb_number); + SUPERLU_FREE(Lrb_indptr); + SUPERLU_FREE(Lrb_valptr); + SUPERLU_FREE(dense); - SUPERLU_FREE(ActiveFlag); - SUPERLU_FREE(ActiveFlagAll); - SUPERLU_FREE(ranks); - SUPERLU_FREE(SeedSTD_BC); - memTRS -= k * dword + grid->nprow * k * iword; // acount for SeedSTD_BC, ActiveFlagAll + /* Find the maximum buffer size. */ + MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, + MPI_MAX, grid->comm); -#if (PROFlevel >= 1) - t = SuperLU_timer_() - t; - if (!iam) - printf(".. Construct Bcast tree for L: %.2f\t\n", t); -#endif + k = CEILING(nsupers, grid->nprow); /* Number of local block rows */ + if (!(Llu->mod_bit = int32Malloc_dist(k))) + ABORT("Malloc fails for mod_bit[]."); #if (PROFlevel >= 1) - t = SuperLU_timer_(); -#endif - /* construct the Reduce tree for L ... */ - /* the following is used as reference */ - nlb = CEILING(nsupers, grid->nprow); /* Number of local block rows */ - if (!(mod_bit = int32Malloc_dist(nlb))) - ABORT("Malloc fails for mod_bit[]."); - if (!(frecv = int32Malloc_dist(nlb))) - ABORT("Malloc fails for frecv[]."); - - for (k = 0; k < nlb; ++k) - mod_bit[k] = 0; - for (k = 0; k < nsupers; ++k) - { - pr = PROW(k, grid); - if (myrow == pr) - { - lib = LBi(k, grid); /* local block number */ - kcol = PCOL(k, grid); - if (mycol == kcol || fmod[lib]) - mod_bit[lib] = 1; /* contribution from off-diagonal and diagonal*/ - } - } - /* Every process receives the count, but it is only useful on the - diagonal processes. 
*/ -#if 0 // Sherry: 1/26/2022 - MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); -#else - MPI_Allreduce(mod_bit, frecv, nlb, MPI_INT, MPI_SUM, grid->rscp.comm); + if (!iam) + printf(".. 1st distribute time:\n " + "\tL\t%.2f\n\tU\t%.2f\n" + "\tu_blks %d\tnrbu %d\n--------\n", + t_l, t_u, u_blks, nrbu); #endif - k = CEILING(nsupers, grid->nprow); /* Number of local block rows */ - if (!(LRtree_ptr = (C_Tree *)SUPERLU_MALLOC(k * sizeof(C_Tree)))) - ABORT("Malloc fails for LRtree_ptr[]."); - if (!(ActiveFlag = intCalloc_dist(grid->npcol * 2))) - ABORT("Calloc fails for ActiveFlag[]."); - if (!(ranks = (int *)SUPERLU_MALLOC(grid->npcol * sizeof(int)))) - ABORT("Malloc fails for ranks[]."); - - if (!(SeedSTD_RD = (double *)SUPERLU_MALLOC(k * sizeof(double)))) - ABORT("Malloc fails for SeedSTD_RD[]."); - - for (i = 0; i < k; i++) - { - SeedSTD_RD[i] = rand(); - } - - MPI_Allreduce(MPI_IN_PLACE, &SeedSTD_RD[0], k, MPI_DOUBLE, MPI_MAX, grid->rscp.comm); - - for (lib = 0; lib < k; ++lib) - { - C_RdTree_Nullify(&LRtree_ptr[lib]); - } + } /* else fact != SamePattern_SameRowPerm */ - if (!(ActiveFlagAll = intMalloc_dist(grid->npcol * k))) - ABORT("Calloc fails for ActiveFlagAll[]."); - for (j = 0; j < grid->npcol * k; ++j) - ActiveFlagAll[j] = -3 * nsupers; - memTRS += k * sizeof(C_Tree) + k * dword + grid->npcol * k * iword; // acount for LRtree_ptr, SeedSTD_RD, ActiveFlagAll - for (jb = 0; jb < nsupers; ++jb) - { /* for each block column ... 
*/ - fsupc = FstBlockC(jb); - pc = PCOL(jb, grid); -#if 1 - // if (superGridMap[jb] != NOT_IN_GRID || !grid3d->zscp.Iam ) -#endif - { - for (i = xlsub[fsupc]; i < xlsub[fsupc + 1]; ++i) - { - irow = lsub[i]; - ib = BlockNum(irow); - pr = PROW(ib, grid); - if (myrow == pr) - { /* Block row ib in my process row */ - lib = LBi(ib, grid); /* Local block number */ - ActiveFlagAll[pc + lib * grid->npcol] = SUPERLU_MAX(ActiveFlagAll[pc + lib * grid->npcol], jb); - } - } - } - } + if (xa[A->ncol] > 0) + { /* may not have any entries on this process. */ + SUPERLU_FREE(asub); + SUPERLU_FREE(a); + } + SUPERLU_FREE(xa); - for (lib = 0; lib < k; ++lib) - { - ib = myrow + lib * grid->nprow; /* not sure */ -#if 1 - // if (superGridMap[ib] != NOT_IN_GRID || !grid3d->zscp.Iam ) +#if (DEBUGlevel >= 1) + /* Memory allocated but not freed: + ilsum, fmod, fsendx_plist, bmod, bsendx_plist */ + CHECK_MALLOC(iam, "Exit pddistribute3d()"); #endif - { - if (ib < nsupers) - { - pr = PROW(ib, grid); - for (j = 0; j < grid->npcol; ++j) - ActiveFlag[j] = ActiveFlagAll[j + lib * grid->npcol]; - ; - for (j = 0; j < grid->npcol; ++j) - ActiveFlag[j + grid->npcol] = j; - for (j = 0; j < grid->npcol; ++j) - ranks[j] = -1; - Root = -1; - Iactive = 0; - - for (j = 0; j < grid->npcol; ++j) - { - if (ActiveFlag[j] != -3 * nsupers) - { - jb = ActiveFlag[j]; - pc = PCOL(jb, grid); - if (jb == ib) - Root = pc; - if (mycol == pc) - Iactive = 1; - } - } - - quickSortM(ActiveFlag, 0, grid->npcol - 1, grid->npcol, 1, 2); - - if (Iactive == 1) - { - assert(Root > -1); - rank_cnt = 1; - ranks[0] = Root; - for (j = 0; j < grid->npcol; ++j) - { - if (ActiveFlag[j] != -3 * nsupers && ActiveFlag[j + grid->npcol] != Root) - { - ranks[rank_cnt] = ActiveFlag[j + grid->npcol]; - ++rank_cnt; - } - } - if (rank_cnt > 1) - { - - for (ii = 0; ii < rank_cnt; ii++) // use global ranks rather than local ranks - ranks[ii] = PNUM(pr, ranks[ii], grid); - - msgsize = SuperSize(ib); - C_RdTree_Create(&LRtree_ptr[lib], grid->comm, 
ranks, rank_cnt, msgsize, 'd'); - LRtree_ptr[lib].tag_ = RD_L; - } - } - } - } - } - - SUPERLU_FREE(mod_bit); - SUPERLU_FREE(frecv); + return (mem_use + memTRS); - SUPERLU_FREE(ActiveFlag); - SUPERLU_FREE(ActiveFlagAll); - SUPERLU_FREE(ranks); +} /* PDDISTRIBUTE3D */ - SUPERLU_FREE(SeedSTD_RD); - memTRS -= k * dword + grid->nprow * k * iword; // acount for SeedSTD_RD, ActiveFlagAll - //////////////////////////////////////////////////////// -#if (PROFlevel >= 1) - t = SuperLU_timer_() - t; - if (!iam) - printf(".. Construct Reduce tree for L: %.2f\t\n", t); -#endif - -#if (PROFlevel >= 1) - t = SuperLU_timer_(); -#endif +float +pddistribute3d_Yang(superlu_dist_options_t *options, int_t n, SuperMatrix *A, + dScalePermstruct_t *ScalePermstruct, + Glu_freeable_t *Glu_freeable, dLUstruct_t *LUstruct, + gridinfo3d_t *grid3d) +/* + * -- Distributed SuperLU routine (version 2.0) -- + * Lawrence Berkeley National Lab, Univ. of California Berkeley. + * March 15, 2003 + * + * + * Purpose + * ======= + * Distribute the matrix onto the 2D process mesh on all grids based on superGridMap created by Piyush + * + * Arguments + * ========= + * + * options (input) superlu_dist_options_t* + * options->Fact Specifies whether or not the L and U structures will be re-used. + * = SamePattern_SameRowPerm: L and U structures are input, and + * unchanged on exit. + * = DOFACT or SamePattern: L and U structures are computed and output. + * + * n (input) int + * Dimension of the matrix. + * + * A (input) SuperMatrix* + * The distributed input matrix A of dimension (A->nrow, A->ncol). + * A may be overwritten by diag(R)*A*diag(C)*Pc^T. The type of A can be: + * Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE. + * + * ScalePermstruct (input) dScalePermstruct_t* + * The data structure to store the scaling and permutation vectors + * describing the transformations performed to the original matrix A. 
+ * + * Glu_freeable (input) Glu_freeable_t* + * The global structure describing the graph of L and U. + * + * LUstruct (input) dLUstruct_t* + * Data structures for L and U factors. + * + * grid3d (input) gridinfo3d_t* + * The 3D process mesh; the distribution uses its 2D grid, grid3d->grid2d. + * + * Return value + * ============ + * > 0, working storage required (in bytes). + * + */ +{ + gridinfo_t *grid = &(grid3d->grid2d); + dtrf3Dpartition_t *trf3Dpart = LUstruct->trf3Dpart; /* Data structure containing 3D partition info */ + SupernodeToGridMap_t *superGridMap = trf3Dpart->superGridMap; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + dLocalLU_t *Llu = LUstruct->Llu; + int_t bnnz, fsupc, fsupc1, i, ii, irow, istart, j, ib, jb, jj, k, k1, + len, len1, nsupc, masked; + int_t lib; /* local block row number */ + int_t nlb; /* local block rows*/ + int_t ljb; /* local block column number */ + int_t nrbl; /* number of L blocks in current block column */ + int_t nrbu; /* number of U blocks in current block column */ + int_t gb; /* global block number; 0 < gb <= nsuper */ + int_t lb; /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */ + int_t ub,gik,iklrow,fnz; + int iam, jbrow, kcol, krow, mycol, myrow, pc, pr; + int_t mybufmax[NBUFFERS]; + NRformat_loc *Astore; + double *a; + int_t *asub, *xa; + int_t *xa_begin, *xa_end; + int_t *xsup = Glu_persist->xsup; /* supernode and column mapping */ + int_t *supno = Glu_persist->supno; + int_t *lsub, *xlsub, *usub, *usub1, *xusub; + int_t nsupers; + int_t next_lind; /* next available position in index[*] */ + int_t next_lval; /* next available position in nzval[*] */ + int_t *index; /* indices consist of headers and row subscripts */ + int_t *index_srt; /* indices consist of headers and row subscripts */ + int *index1; /* temporary pointer to array of int */ + double *lusup, *lusup_srt, *uval; /* nonzero values in L and U */ - /* construct the Bcast tree for U ... 
*/ + double **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ + double *Lnzval_bc_dat; /* size sum of sizes of Lnzval_bc_ptr[lk]) */ + long int *Lnzval_bc_offset; /* size ceil(NSUPERS/Pc) */ + + int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ + int_t *Lrowind_bc_dat; /* size sum of sizes of Lrowind_bc_ptr[lk]) */ + long int *Lrowind_bc_offset; /* size ceil(NSUPERS/Pc) */ + + int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) */ + int_t *Lindval_loc_bc_dat; /* size sum of sizes of Lindval_loc_bc_ptr[lk]) */ + long int *Lindval_loc_bc_offset; /* size ceil(NSUPERS/Pc) */ + + int_t *Unnz; /* size ceil(NSUPERS/Pc) */ + double **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ + double *Unzval_br_dat; /* size sum of sizes of Unzval_br_ptr[lk]) */ + long int *Unzval_br_offset; /* size ceil(NSUPERS/Pr) */ + long int Unzval_br_cnt=0; + int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ + int_t *Ufstnz_br_dat; /* size sum of sizes of Ufstnz_br_ptr[lk]) */ + long int *Ufstnz_br_offset; /* size ceil(NSUPERS/Pr) */ + long int Ufstnz_br_cnt=0; + + C_Tree *LBtree_ptr; /* size ceil(NSUPERS/Pc) */ + C_Tree *LRtree_ptr; /* size ceil(NSUPERS/Pr) */ + C_Tree *UBtree_ptr; /* size ceil(NSUPERS/Pc) */ + C_Tree *URtree_ptr; /* size ceil(NSUPERS/Pr) */ + int msgsize; + + int_t *Urbs,*Urbs1; /* Number of row blocks in each block column of U. 
*/ + Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ + Ucb_indptr_t *Ucb_inddat; + long int *Ucb_indoffset; + long int Ucb_indcnt=0; - k = CEILING(nsupers, grid->npcol); /* Number of local block columns */ - if (!(UBtree_ptr = (C_Tree *)SUPERLU_MALLOC(k * sizeof(C_Tree)))) - ABORT("Malloc fails for UBtree_ptr[]."); - if (!(ActiveFlag = intCalloc_dist(grid->nprow * 2))) - ABORT("Calloc fails for ActiveFlag[]."); - if (!(ranks = (int *)SUPERLU_MALLOC(grid->nprow * sizeof(int)))) - ABORT("Malloc fails for ranks[]."); - if (!(SeedSTD_BC = (double *)SUPERLU_MALLOC(k * sizeof(double)))) - ABORT("Malloc fails for SeedSTD_BC[]."); + int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ + int_t *Ucb_valdat; + long int *Ucb_valoffset; + long int Ucb_valcnt=0; - for (i = 0; i < k; i++) - { - SeedSTD_BC[i] = rand(); - } + /*-- Counts to be used in factorization. --*/ + int *ToRecv, *ToSendD, **ToSendR; - MPI_Allreduce(MPI_IN_PLACE, &SeedSTD_BC[0], k, MPI_DOUBLE, MPI_MAX, grid->cscp.comm); + /*-- Counts to be used in lower triangular solve. --*/ + int *fmod; /* Modification count for L-solve. */ + int **fsendx_plist; /* Column process list to send down Xk. */ + int nfrecvx = 0; /* Number of Xk I will receive. */ + int nfsendx = 0; /* Number of Xk I will send */ + int kseen; - for (ljb = 0; ljb < k; ++ljb) - { - C_BcTree_Nullify(&UBtree_ptr[ljb]); - } + /*-- Counts to be used in upper triangular solve. --*/ + int *bmod; /* Modification count for U-solve. */ + int **bsendx_plist; /* Column process list to send down Xk. */ + int nbrecvx = 0; /* Number of Xk I will receive. 
*/ + int nbsendx = 0; /* Number of Xk I will send */ - if (!(ActiveFlagAll = intMalloc_dist(grid->nprow * k))) - ABORT("Calloc fails for ActiveFlagAll[]."); - for (j = 0; j < grid->nprow * k; ++j) - ActiveFlagAll[j] = -3 * nsupers; - memTRS += k * sizeof(C_Tree) + k * dword + grid->nprow * k * iword; // acount for UBtree_ptr, SeedSTD_BC, ActiveFlagAll + int_t *ilsum; /* starting position of each supernode in + the full array (local) */ - for (ljb = 0; ljb < k; ++ljb) - { /* for each local block column ... */ - jb = mycol + ljb * grid->npcol; /* not sure */ -#if 1 - // if (superGridMap[jb] != NOT_IN_GRID || !grid3d->zscp.Iam ) -#endif - { - if (jb < nsupers) - { - pc = PCOL(jb, grid); + /*-- Auxiliary arrays; freed on return --*/ + int_t *rb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ + int_t *Urb_length; /* U block length; size ceil(NSUPERS/Pr) */ + int_t *Urb_indptr; /* pointers to U index[]; size ceil(NSUPERS/Pr) */ + int_t *Urb_fstnz; /* # of fstnz in a block row; size ceil(NSUPERS/Pr) */ + int_t *Ucbs; /* number of column blocks in a block row */ + int_t *Lrb_length; /* L block length; size ceil(NSUPERS/Pr) */ + int_t *Lrb_number; /* global block number; size ceil(NSUPERS/Pr) */ + int_t *Lrb_indptr; /* pointers to L index[]; size ceil(NSUPERS/Pr) */ + int_t *Lrb_valptr; /* pointers to L nzval[]; size ceil(NSUPERS/Pr) */ + int_t *ActiveFlag; + int_t *ActiveFlagAll; + int_t Iactive; + int *ranks; + int_t *idxs; + int_t **nzrows; + double rseed; + int rank_cnt,rank_cnt_ref,Root; + double *dense, *dense_col; /* SPA */ + double zero = 0.0; + int_t ldaspa; /* LDA of SPA */ + int_t iword, dword; + float mem_use = 0.0; + float memTRS = 0.; /* memory allocated for storing the meta-data for triangular solve (positive number)*/ - fsupc = FstBlockC(jb); - for (j = fsupc; j < FstBlockC(jb + 1); ++j) - { - istart = xusub[j]; - /* NOTE: Only the first nonzero index of the segment - is stored in usub[]. 
*/ - for (i = istart; i < xusub[j + 1]; ++i) - { - irow = usub[i]; /* First nonzero in the segment. */ - gb = BlockNum(irow); - pr = PROW(gb, grid); - ActiveFlagAll[pr + ljb * grid->nprow] = SUPERLU_MAX(ActiveFlagAll[pr + ljb * grid->nprow], gb); - } - } - pr = PROW(jb, grid); // take care of diagonal node stored as L + int *mod_bit; + int *frecv, *brecv; + int_t *lloc; + double **Linv_bc_ptr; /* size ceil(NSUPERS/Pc) */ + double *Linv_bc_dat; /* size sum of sizes of Linv_bc_ptr[lk]) */ + long int *Linv_bc_offset; /* size ceil(NSUPERS/Pc) */ + double **Uinv_bc_ptr; /* size ceil(NSUPERS/Pc) */ + double *Uinv_bc_dat; /* size sum of sizes of Uinv_bc_ptr[lk]) */ + long int *Uinv_bc_offset; /* size ceil(NSUPERS/Pc) */ + double *SeedSTD_BC,*SeedSTD_RD; + int_t idx_indx,idx_lusup; + int_t nbrow; + int_t ik, il, lk, rel, knsupc, idx_r; + int_t lptr1_tmp, idx_i, idx_v,m, uu; + int_t nub; + int tag; - ActiveFlagAll[pr + ljb * grid->nprow] = SUPERLU_MAX(ActiveFlagAll[pr + ljb * grid->nprow], jb); - } - } - } - for (ljb = 0; ljb < k; ++ljb) - { /* for each block column ... 
*/ - jb = mycol + ljb * grid->npcol; /* not sure */ -#if 1 - // if (superGridMap[jb] != NOT_IN_GRID || !grid3d->zscp.Iam ) +#if ( PRNTlevel>=1 ) + int_t nLblocks = 0, nUblocks = 0; +#endif +#if ( PROFlevel>=1 ) + double t, t_u, t_l; + int_t u_blks; #endif - { - if (jb < nsupers) - { - pc = PCOL(jb, grid); - // if ( mycol == pc ) { /* Block column jb in my process column */ - - for (j = 0; j < grid->nprow; ++j) - ActiveFlag[j] = ActiveFlagAll[j + ljb * grid->nprow]; - for (j = 0; j < grid->nprow; ++j) - ActiveFlag[j + grid->nprow] = j; - for (j = 0; j < grid->nprow; ++j) - ranks[j] = -1; - - Root = -1; - Iactive = 0; - for (j = 0; j < grid->nprow; ++j) - { - if (ActiveFlag[j] != -3 * nsupers) - { - gb = ActiveFlag[j]; - pr = PROW(gb, grid); - if (gb == jb) - Root = pr; - if (myrow == pr) - Iactive = 1; - } - } - - quickSortM(ActiveFlag, 0, grid->nprow - 1, grid->nprow, 1, 2); - - if (Iactive == 1) - { - - assert(Root > -1); - rank_cnt = 1; - ranks[0] = Root; - for (j = 0; j < grid->nprow; ++j) - { - if (ActiveFlag[j] != -3 * nsupers && ActiveFlag[j + grid->nprow] != Root) - { - ranks[rank_cnt] = ActiveFlag[j + grid->nprow]; - ++rank_cnt; - } - } - - if (rank_cnt > 1) - { - for (ii = 0; ii < rank_cnt; ii++) // use global ranks rather than local ranks - ranks[ii] = PNUM(ranks[ii], pc, grid); - - msgsize = SuperSize(jb); - - C_BcTree_Create(&UBtree_ptr[ljb], grid->comm, ranks, rank_cnt, msgsize, 'd'); - UBtree_ptr[ljb].tag_ = BC_U; - - if (Root == myrow) - { - rank_cnt_ref = 1; - for (j = 0; j < grid->nprow; ++j) - { - if (bsendx_plist[ljb][j] != SLU_EMPTY) - { - ++rank_cnt_ref; - } - } + /* Initialization. 
*/ + iam = grid->iam; + myrow = MYROW( iam, grid ); + mycol = MYCOL( iam, grid ); + for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0; + nsupers = supno[n-1] + 1; + Astore = (NRformat_loc *) A->Store; - assert(rank_cnt == rank_cnt_ref); - } - } - } - } - } - } - SUPERLU_FREE(ActiveFlag); - SUPERLU_FREE(ActiveFlagAll); - SUPERLU_FREE(ranks); - SUPERLU_FREE(SeedSTD_BC); - memTRS -= k * dword + grid->nprow * k * iword; // acount for SeedSTD_BC, ActiveFlagAll +//#if ( PRNTlevel>=1 ) + iword = sizeof(int_t); + dword = sizeof(double); +//#endif -#if (PROFlevel >= 1) - t = SuperLU_timer_() - t; - if (!iam) - printf(".. Construct Bcast tree for U: %.2f\t\n", t); +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter pddistribute_allgrid()"); #endif - -#if (PROFlevel >= 1) - t = SuperLU_timer_(); +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); #endif - /* construct the Reduce tree for U ... */ - /* the following is used as reference */ - nlb = CEILING(nsupers, grid->nprow); /* Number of local block rows */ - if (!(mod_bit = int32Malloc_dist(nlb))) - ABORT("Malloc fails for mod_bit[]."); - if (!(brecv = int32Malloc_dist(nlb))) - ABORT("Malloc fails for brecv[]."); - - for (k = 0; k < nlb; ++k) - mod_bit[k] = 0; - for (k = 0; k < nsupers; ++k) - { - pr = PROW(k, grid); - if (myrow == pr) - { - lib = LBi(k, grid); /* local block number */ - kcol = PCOL(k, grid); - if (mycol == kcol || bmod[lib]) - mod_bit[lib] = 1; /* contribution from off-diagonal and diagonal*/ - } - } - /* Every process receives the count, but it is only useful on the - diagonal processes. 
*/ - MPI_Allreduce(mod_bit, brecv, nlb, MPI_INT, MPI_SUM, grid->rscp.comm); - - k = CEILING(nsupers, grid->nprow); /* Number of local block rows */ - if (!(URtree_ptr = (C_Tree *)SUPERLU_MALLOC(k * sizeof(C_Tree)))) - ABORT("Malloc fails for URtree_ptr[]."); - if (!(ActiveFlag = intCalloc_dist(grid->npcol * 2))) - ABORT("Calloc fails for ActiveFlag[]."); - if (!(ranks = (int *)SUPERLU_MALLOC(grid->npcol * sizeof(int)))) - ABORT("Malloc fails for ranks[]."); - - if (!(SeedSTD_RD = (double *)SUPERLU_MALLOC(k * sizeof(double)))) - ABORT("Malloc fails for SeedSTD_RD[]."); - - for (i = 0; i < k; i++) - { - SeedSTD_RD[i] = rand(); - } - - MPI_Allreduce(MPI_IN_PLACE, &SeedSTD_RD[0], k, MPI_DOUBLE, MPI_MAX, grid->rscp.comm); - for (lib = 0; lib < k; ++lib) - { - C_RdTree_Nullify(&URtree_ptr[lib]); - } - if (!(ActiveFlagAll = intMalloc_dist(grid->npcol * k))) - ABORT("Calloc fails for ActiveFlagAll[]."); - for (j = 0; j < grid->npcol * k; ++j) - ActiveFlagAll[j] = 3 * nsupers; - memTRS += k * sizeof(C_Tree) + k * dword + grid->npcol * k * iword; // acount for URtree_ptr, SeedSTD_RD, ActiveFlagAll + dReDistribute_A(A, ScalePermstruct, Glu_freeable, xsup, supno, + grid, &xa, &asub, &a); - for (jb = 0; jb < nsupers; ++jb) - { /* for each block column ... */ -#if 1 - // if (superGridMap[jb] != NOT_IN_GRID || !grid3d->zscp.Iam ) +#if ( PROFlevel>=1 ) + t = SuperLU_timer_() - t; + if ( !iam ) printf("--------\n" + ".. Phase 1 - ReDistribute_A time: %.2f\t\n", t); #endif - { - fsupc = FstBlockC(jb); - pc = PCOL(jb, grid); - fsupc = FstBlockC(jb); - for (j = fsupc; j < FstBlockC(jb + 1); ++j) - { - istart = xusub[j]; - /* NOTE: Only the first nonzero index of the segment - is stored in usub[]. */ - for (i = istart; i < xusub[j + 1]; ++i) - { - irow = usub[i]; /* First nonzero in the segment. 
*/ - ib = BlockNum(irow); - pr = PROW(ib, grid); - if (myrow == pr) - { /* Block row ib in my process row */ - lib = LBi(ib, grid); /* Local block number */ - ActiveFlagAll[pc + lib * grid->npcol] = SUPERLU_MIN(ActiveFlagAll[pc + lib * grid->npcol], jb); - } - } - } + if ( options->Fact == SamePattern_SameRowPerm ) { - pr = PROW(jb, grid); - if (myrow == pr) - { /* Block row ib in my process row */ - lib = LBi(jb, grid); /* Local block number */ - ActiveFlagAll[pc + lib * grid->npcol] = SUPERLU_MIN(ActiveFlagAll[pc + lib * grid->npcol], jb); - } - } - } - - for (lib = 0; lib < k; ++lib) - { - ib = myrow + lib * grid->nprow; /* not sure */ -#if 1 - // if (superGridMap[ib] != NOT_IN_GRID || !grid3d->zscp.Iam) +#if ( PROFlevel>=1 ) + t_l = t_u = 0; u_blks = 0; +#endif + /* We can propagate the new values of A into the existing + L and U data structures. */ + ilsum = Llu->ilsum; + ldaspa = Llu->ldalsum; + if ( !(dense = doubleCalloc_dist(ldaspa * sp_ienv_dist(3,options))) ) + ABORT("Calloc fails for SPA dense[]."); + nrbu = CEILING( nsupers, grid->nprow ); /* No. 
of local block rows */ + if ( !(Urb_length = intCalloc_dist(nrbu)) ) + ABORT("Calloc fails for Urb_length[]."); + if ( !(Urb_indptr = intMalloc_dist(nrbu)) ) + ABORT("Malloc fails for Urb_indptr[]."); + Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + Lindval_loc_bc_ptr = Llu->Lindval_loc_bc_ptr; + Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + Unzval_br_ptr = Llu->Unzval_br_ptr; + Unnz = Llu->Unnz; + + mem_use += 2.0*nrbu*iword + ldaspa*sp_ienv_dist(3,options)*dword; + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); #endif - { - if (ib < nsupers) - { - pr = PROW(ib, grid); - for (j = 0; j < grid->npcol; ++j) - ActiveFlag[j] = ActiveFlagAll[j + lib * grid->npcol]; - ; - for (j = 0; j < grid->npcol; ++j) - ActiveFlag[j + grid->npcol] = j; - for (j = 0; j < grid->npcol; ++j) - ranks[j] = -1; - Root = -1; - Iactive = 0; - - for (j = 0; j < grid->npcol; ++j) - { - if (ActiveFlag[j] != 3 * nsupers) - { - jb = ActiveFlag[j]; - pc = PCOL(jb, grid); - if (jb == ib) - Root = pc; - if (mycol == pc) - Iactive = 1; - } - } - quickSortM(ActiveFlag, 0, grid->npcol - 1, grid->npcol, 0, 2); + /* Initialize Uval to zero. */ + for (lb = 0; lb < nrbu; ++lb) { + Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ + index = Ufstnz_br_ptr[lb]; + if ( index ) { + uval = Unzval_br_ptr[lb]; + len = index[1]; + for (i = 0; i < len; ++i) uval[i] = zero; + } /* if index != NULL */ + } /* for lb ... */ + + for (jb = 0; jb < nsupers; ++jb) { /* Loop through each block column */ + pc = PCOL( jb, grid ); + if ( mycol == pc ) { /* Block column jb in my process column */ + fsupc = FstBlockC( jb ); + nsupc = SuperSize( jb ); + + /* Scatter A into SPA (for L), or into U directly. 
*/ + for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) { + for (i = xa[j]; i < xa[j+1]; ++i) { + irow = asub[i]; + gb = BlockNum( irow ); + if ( myrow == PROW( gb, grid ) ) { + lb = LBi( gb, grid ); + if ( gb < jb ) { /* in U */ + index = Ufstnz_br_ptr[lb]; + uval = Unzval_br_ptr[lb]; + if(index){ + while ( (k = index[Urb_indptr[lb]]) < jb ) { + /* Skip nonzero values in this block */ + Urb_length[lb] += index[Urb_indptr[lb]+1]; + /* Move pointer to the next block */ + Urb_indptr[lb] += UB_DESCRIPTOR + + SuperSize( k ); + } + /*assert(k == jb);*/ + /* start fstnz */ + istart = Urb_indptr[lb] + UB_DESCRIPTOR; + len = Urb_length[lb]; + fsupc1 = FstBlockC( gb+1 ); + k = j - fsupc; + /* Sum the lengths of the leading columns */ + for (jj = 0; jj < k; ++jj) + len += fsupc1 - index[istart++]; + /*assert(irow>=index[istart]);*/ + uval[len + irow - index[istart]] = a[i]; + } + } else { /* in L; put in SPA first */ + irow = ilsum[lb] + irow - FstBlockC( gb ); + dense_col[irow] = a[i]; + } + } + } /* for i ... */ + dense_col += ldaspa; + } /* for j ... */ + +#if ( PROFlevel>=1 ) + t_u += SuperLU_timer_() - t; + t = SuperLU_timer_(); +#endif - if (Iactive == 1) - { - assert(Root > -1); - rank_cnt = 1; - ranks[0] = Root; - for (j = 0; j < grid->npcol; ++j) - { - if (ActiveFlag[j] != 3 * nsupers && ActiveFlag[j + grid->npcol] != Root) - { - ranks[rank_cnt] = ActiveFlag[j + grid->npcol]; - ++rank_cnt; - } - } - if (rank_cnt > 1) - { + /* Gather the values of A from SPA into Lnzval[]. */ + ljb = LBj( jb, grid ); /* Local block number */ + index = Lrowind_bc_ptr[ljb]; + if ( index ) { + nrbl = index[0]; /* Number of row blocks. */ + len = index[1]; /* LDA of lusup[]. */ + lusup = Lnzval_bc_ptr[ljb]; + next_lind = BC_HEADER; + next_lval = 0; + for (jj = 0; jj < nrbl; ++jj) { + gb = index[next_lind++]; + len1 = index[next_lind++]; /* Rows in the block. */ + lb = LBi( gb, grid ); + for (bnnz = 0; bnnz < len1; ++bnnz) { + irow = index[next_lind++]; /* Global index. 
*/ + irow = ilsum[lb] + irow - FstBlockC( gb ); + k = next_lval++; + for (j = 0, dense_col = dense; j < nsupc; ++j) { + lusup[k] = dense_col[irow]; + dense_col[irow] = zero; + k += len; + dense_col += ldaspa; + } + } /* for bnnz ... */ + } /* for jj ... */ + } /* if index ... */ +#if ( PROFlevel>=1 ) + t_l += SuperLU_timer_() - t; +#endif + } /* if mycol == pc */ + } /* for jb ... */ + + SUPERLU_FREE(dense); + SUPERLU_FREE(Urb_length); + SUPERLU_FREE(Urb_indptr); + mem_use -= 2.0*nrbu*iword + ldaspa*sp_ienv_dist(3,options)*dword; + +#if ( PROFlevel>=1 ) + if ( !iam ) printf(".. 2nd distribute time: L %.2f\tU %.2f\tu_blks %d\tnrbu %d\n", + t_l, t_u, u_blks, nrbu); +#endif - for (ii = 0; ii < rank_cnt; ii++) // use global ranks rather than local ranks - ranks[ii] = PNUM(pr, ranks[ii], grid); + } else { /* fact is not SamePattern_SameRowPerm */ + /* ------------------------------------------------------------ + FIRST TIME CREATING THE L AND U DATA STRUCTURES. + ------------------------------------------------------------*/ - msgsize = SuperSize(ib); +#if ( PROFlevel>=1 ) + t_l = t_u = 0; u_blks = 0; +#endif + /* We first need to set up the L and U data structures and then + * propagate the values of A into them. 
+ */ + lsub = Glu_freeable->lsub; /* compressed L subscripts */ + xlsub = Glu_freeable->xlsub; + usub = Glu_freeable->usub; /* compressed U subscripts */ + xusub = Glu_freeable->xusub; + + if ( !(ToRecv = (int *) SUPERLU_MALLOC(nsupers * sizeof(int))) ) + ABORT("Malloc fails for ToRecv[]."); + for (i = 0; i < nsupers; ++i) ToRecv[i] = 0; + + k = CEILING( nsupers, grid->npcol );/* Number of local column blocks */ + if ( !(ToSendR = (int **) SUPERLU_MALLOC(k*sizeof(int*))) ) + ABORT("Malloc fails for ToSendR[]."); + j = k * grid->npcol; + if ( !(index1 = SUPERLU_MALLOC(j * sizeof(int))) ) + ABORT("Malloc fails for index[]."); + + mem_use += (float) k*sizeof(int_t*) + (j + nsupers)*iword; + + for (i = 0; i < j; ++i) index1[i] = SLU_EMPTY; + for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index1[j]; + k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ + + /* Pointers to the beginning of each block row of U. */ + if ( !(Unzval_br_ptr = + (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) + ABORT("Malloc fails for Unzval_br_ptr[]."); + if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) + ABORT("Malloc fails for Ufstnz_br_ptr[]."); + + + if ( !(ToSendD = SUPERLU_MALLOC(k * sizeof(int))) ) + ABORT("Malloc fails for ToSendD[]."); + for (i = 0; i < k; ++i) ToSendD[i] = NO; + if ( !(ilsum = intMalloc_dist(k+1)) ) + ABORT("Malloc fails for ilsum[]."); + + /* Auxiliary arrays used to set up U block data structures. + They are freed on return. 
*/ + if ( !(rb_marker = intCalloc_dist(k)) ) + ABORT("Calloc fails for rb_marker[]."); + if ( !(Urb_length = intCalloc_dist(k)) ) + ABORT("Calloc fails for Urb_length[]."); + if ( !(Urb_indptr = intMalloc_dist(k)) ) + ABORT("Malloc fails for Urb_indptr[]."); + if ( !(Urb_fstnz = intCalloc_dist(k)) ) + ABORT("Calloc fails for Urb_fstnz[]."); + if ( !(Ucbs = intCalloc_dist(k)) ) + ABORT("Calloc fails for Ucbs[]."); + + mem_use += 2.0*k*sizeof(int_t*) + (7*k+1)*iword; + + /* Compute ldaspa and ilsum[]. */ + ldaspa = 0; + ilsum[0] = 0; + for (gb = 0; gb < nsupers; ++gb) { + if ( myrow == PROW( gb, grid ) ) { + i = SuperSize( gb ); + ldaspa += i; + lb = LBi( gb, grid ); + ilsum[lb + 1] = ilsum[lb] + i; + } + } + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + /* ------------------------------------------------------------ + COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U. + THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U). + ------------------------------------------------------------*/ + + /* Loop through each supernode column. */ + for (jb = 0; jb < nsupers; ++jb) { + pc = PCOL( jb, grid ); + fsupc = FstBlockC( jb ); + nsupc = SuperSize( jb ); + /* Loop through each column in the block. */ + for (j = fsupc; j < fsupc + nsupc; ++j) { + /* usub[*] contains only "first nonzero" in each segment. */ + for (i = xusub[j]; i < xusub[j+1]; ++i) { + irow = usub[i]; /* First nonzero of the segment. */ + gb = BlockNum( irow ); + kcol = PCOL( gb, grid ); + ljb = LBj( gb, grid ); + if ( mycol == kcol && mycol != pc ) ToSendR[ljb][pc] = YES; + pr = PROW( gb, grid ); + lb = LBi( gb, grid ); + if ( mycol == pc ) { + if ( myrow == pr ) { + ToSendD[lb] = YES; + /* Count nonzeros in entire block row. */ + Urb_length[lb] += FstBlockC( gb+1 ) - irow; + if (rb_marker[lb] <= jb) {/* First see the block */ + rb_marker[lb] = jb + 1; + Urb_fstnz[lb] += nsupc; + ++Ucbs[lb]; /* Number of column blocks + in block row lb. 
*/ +#if ( PRNTlevel>=1 ) + ++nUblocks; +#endif + } + ToRecv[gb] = 1; + } else ToRecv[gb] = 2; /* Do I need 0, 1, 2 ? */ + } + } /* for i ... */ + } /* for j ... */ + } /* for jb ... */ + + /* Set up the initial pointers for each block row in U. */ + nrbu = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + for (lb = 0; lb < nrbu; ++lb) { + ib = myrow+lb*grid->nprow; /* not sure */ + len = Urb_length[lb]; + rb_marker[lb] = 0; /* Reset block marker. */ + if ( len ) { + /* Add room for descriptors */ + len1 = Urb_fstnz[lb] + BR_HEADER + Ucbs[lb] * UB_DESCRIPTOR; + mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 ); + mybufmax[3] = SUPERLU_MAX( mybufmax[3], len ); + + if(superGridMap[ib]!= NOT_IN_GRID){ // YL: added supernode mask here + if ( !(index = intMalloc_dist(len1+1)) ) + ABORT("Malloc fails for Uindex[]."); + Ufstnz_br_ptr[lb] = index; + if ( !(Unzval_br_ptr[lb] = doubleMalloc_dist(len)) ) + ABORT("Malloc fails for Unzval_br_ptr[*][]."); + + mem_use += len*dword + (len1+1)*iword; + + index[0] = Ucbs[lb]; /* Number of column blocks */ + index[1] = len; /* Total length of nzval[] */ + index[2] = len1; /* Total length of index[] */ + index[len1] = -1; /* End marker */ + }else{ + Ufstnz_br_ptr[lb] = NULL; + Unzval_br_ptr[lb] = NULL; + } + } else { + Ufstnz_br_ptr[lb] = NULL; + Unzval_br_ptr[lb] = NULL; + } + Urb_length[lb] = 0; /* Reset block length. */ + Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ + Urb_fstnz[lb] = BR_HEADER; + } /* for lb ... */ + + SUPERLU_FREE(Ucbs); + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. Phase 2 - setup U strut time: %.2f\t\n", t); +#endif - C_RdTree_Create(&URtree_ptr[lib], grid->comm, ranks, rank_cnt, msgsize, 'd'); - URtree_ptr[lib].tag_ = RD_U; - if (Root == mycol) - { - assert(rank_cnt == brecv[lib]); - } - } + mem_use -= 2.0*k * iword; + + /* Auxiliary arrays used to set up L block data structures. + They are freed on return. + k is the number of local row blocks. 
*/ + if ( !(Lrb_length = intCalloc_dist(k)) ) + ABORT("Calloc fails for Lrb_length[]."); + if ( !(Lrb_number = intMalloc_dist(k)) ) + ABORT("Malloc fails for Lrb_number[]."); + if ( !(Lrb_indptr = intMalloc_dist(k)) ) + ABORT("Malloc fails for Lrb_indptr[]."); + if ( !(Lrb_valptr = intMalloc_dist(k)) ) + ABORT("Malloc fails for Lrb_valptr[]."); + if ( !(dense = doubleCalloc_dist(ldaspa * sp_ienv_dist(3,options))) ) + ABORT("Calloc fails for SPA dense[]."); + + /* These counts will be used for triangular solves. */ + if ( !(fmod = int32Calloc_dist(k)) ) + ABORT("Calloc fails for fmod[]."); + if ( !(bmod = int32Calloc_dist(k)) ) + ABORT("Calloc fails for bmod[]."); + + /* ------------------------------------------------ */ + mem_use += 6.0*k*iword + ldaspa*sp_ienv_dist(3,options)*dword; + + k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ + + /* Pointers to the beginning of each block column of L. */ + if ( !(Lnzval_bc_ptr = + (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) + ABORT("Malloc fails for Lnzval_bc_ptr[]."); + Lnzval_bc_ptr[k-1] = NULL; + if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) + ABORT("Malloc fails for Lrowind_bc_ptr[]."); + Lrowind_bc_ptr[k-1] = NULL; + + if ( !(Lindval_loc_bc_ptr = + (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) + ABORT("Malloc fails for Lindval_loc_bc_ptr[]."); + Lindval_loc_bc_ptr[k-1] = NULL; + + if ( !(Linv_bc_ptr = + (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) { + fprintf(stderr, "Malloc fails for Linv_bc_ptr[]."); + } + if ( !(Uinv_bc_ptr = + (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) { + fprintf(stderr, "Malloc fails for Uinv_bc_ptr[]."); + } + Linv_bc_ptr[k-1] = NULL; + Uinv_bc_ptr[k-1] = NULL; + + if ( !(Unnz = + (int_t*)SUPERLU_MALLOC(k * sizeof(int_t))) ) + ABORT("Malloc fails for Unnz[]."); + + + /* These lists of processes will be used for triangular solves. 
*/ + if ( !(fsendx_plist = (int **) SUPERLU_MALLOC(k*sizeof(int*))) ) + ABORT("Malloc fails for fsendx_plist[]."); + len = k * grid->nprow; + if ( !(index1 = int32Malloc_dist(len)) ) + ABORT("Malloc fails for fsendx_plist[0]"); + for (i = 0; i < len; ++i) index1[i] = SLU_EMPTY; + for (i = 0, j = 0; i < k; ++i, j += grid->nprow) + fsendx_plist[i] = &index1[j]; + if ( !(bsendx_plist = (int **) SUPERLU_MALLOC(k*sizeof(int*))) ) + ABORT("Malloc fails for bsendx_plist[]."); + if ( !(index1 = int32Malloc_dist(len)) ) + ABORT("Malloc fails for bsendx_plist[0]"); + for (i = 0; i < len; ++i) index1[i] = SLU_EMPTY; + for (i = 0, j = 0; i < k; ++i, j += grid->nprow) + bsendx_plist[i] = &index1[j]; + /* -------------------------------------------------------------- */ + mem_use += 4.0*k*sizeof(int_t*) + 2.0*len*iword; + memTRS += k*sizeof(int_t*) + 2.0*k*sizeof(double*) + k*iword; //acount for Lindval_loc_bc_ptr, Unnz, Linv_bc_ptr,Uinv_bc_ptr + + /*------------------------------------------------------------ + PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS. + THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U. + ------------------------------------------------------------*/ + long int Linv_bc_cnt=0; + long int Uinv_bc_cnt=0; + long int Lrowind_bc_cnt=0; + long int Lnzval_bc_cnt=0; + long int Lindval_loc_bc_cnt=0; + for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */ + pc = PCOL( jb, grid ); + if ( mycol == pc ) { /* Block column jb in my process column */ + fsupc = FstBlockC( jb ); + nsupc = SuperSize( jb ); + ljb = LBj( jb, grid ); /* Local block number */ + + /* Scatter A into SPA. */ + for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) { + for (i = xa[j]; i < xa[j+1]; ++i) { + irow = asub[i]; + gb = BlockNum( irow ); + if ( myrow == PROW( gb, grid ) ) { + lb = LBi( gb, grid ); + irow = ilsum[lb] + irow - FstBlockC( gb ); + dense_col[irow] = a[i]; + } + } + dense_col += ldaspa; + } /* for j ... 
*/ + + jbrow = PROW( jb, grid ); + + /*------------------------------------------------ + * SET UP U BLOCKS. + *------------------------------------------------*/ +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + kseen = 0; + dense_col = dense; + /* Loop through each column in the block column. */ + for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { + istart = xusub[j]; + /* NOTE: Only the first nonzero index of the segment + is stored in usub[]. */ + for (i = istart; i < xusub[j+1]; ++i) { + irow = usub[i]; /* First nonzero in the segment. */ + gb = BlockNum( irow ); + pr = PROW( gb, grid ); + if ( pr != jbrow && + myrow == jbrow && /* diag. proc. owning jb */ + bsendx_plist[ljb][pr] == SLU_EMPTY ) { + bsendx_plist[ljb][pr] = YES; + ++nbsendx; } + if ( myrow == pr) { // YL: added supernode mask here, TODO: double check bmod + if(superGridMap[gb]!= NOT_IN_GRID){ + lb = LBi( gb, grid ); /* Local block number */ + index = Ufstnz_br_ptr[lb]; + uval = Unzval_br_ptr[lb]; + fsupc1 = FstBlockC( gb+1 ); + if (rb_marker[lb] <= jb) { /* First time see + the block */ + rb_marker[lb] = jb + 1; + Urb_indptr[lb] = Urb_fstnz[lb];; + index[Urb_indptr[lb]] = jb; /* Descriptor */ + Urb_indptr[lb] += UB_DESCRIPTOR; + /* Record the first location in index[] of the + next block */ + Urb_fstnz[lb] = Urb_indptr[lb] + nsupc; + len = Urb_indptr[lb];/* Start fstnz in index */ + index[len-1] = 0; + for (k = 0; k < nsupc; ++k) + index[len+k] = fsupc1; + if ( gb != jb )/* Exclude diagonal block. */ + ++bmod[lb];/* Mod. count for back solve */ + if ( kseen == 0 && myrow != jbrow ) { + ++nbrecvx; + kseen = 1; + } + } else { /* Already saw the block */ + len = Urb_indptr[lb];/* Start fstnz in index */ + } + jj = j - fsupc; + index[len+jj] = irow; + /* Load the numerical values */ + k = fsupc1 - irow; /* No. 
of nonzeros in segment */ + index[len-1] += k; /* Increment block length in + Descriptor */ + irow = ilsum[lb] + irow - FstBlockC( gb ); + for (ii = 0; ii < k; ++ii) { + uval[Urb_length[lb]++] = dense_col[irow + ii]; + dense_col[irow + ii] = zero; + } + }else{ + lb = LBi( gb, grid ); /* Local block number */ + uval = Unzval_br_ptr[lb]; + fsupc1 = FstBlockC( gb+1 ); + if (rb_marker[lb] <= jb) { /* First time see + the block */ + rb_marker[lb] = jb + 1; + Urb_indptr[lb] = Urb_fstnz[lb];; + Urb_indptr[lb] += UB_DESCRIPTOR; + /* Record the first location in index[] of the + next block */ + Urb_fstnz[lb] = Urb_indptr[lb] + nsupc; + + if ( gb != jb )/* Exclude diagonal block. */ + ++bmod[lb];/* Mod. count for back solve */ + if ( kseen == 0 && myrow != jbrow ) { + ++nbrecvx; + kseen = 1; + } + } + } + + } /* if myrow == pr ... */ + } /* for i ... */ + dense_col += ldaspa; + } /* for j ... */ + +#if ( PROFlevel>=1 ) + t_u += SuperLU_timer_() - t; + t = SuperLU_timer_(); +#endif + /*------------------------------------------------ + * SET UP L BLOCKS. + *------------------------------------------------*/ + + /* Count number of blocks and length of each block. */ + nrbl = 0; + len = 0; /* Number of row subscripts I own. */ + kseen = 0; + istart = xlsub[fsupc]; + for (i = istart; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + gb = BlockNum( irow ); /* Global block number */ + pr = PROW( gb, grid ); /* Process row owning this block */ + if ( pr != jbrow && + myrow == jbrow && /* diag. proc. owning jb */ + fsendx_plist[ljb][pr] == SLU_EMPTY /* first time */ ) { + fsendx_plist[ljb][pr] = YES; + ++nfsendx; } - } - } - SUPERLU_FREE(mod_bit); - SUPERLU_FREE(brecv); - SUPERLU_FREE(ActiveFlag); - SUPERLU_FREE(ActiveFlagAll); - SUPERLU_FREE(ranks); - SUPERLU_FREE(SeedSTD_RD); - - memTRS -= k * dword + grid->nprow * k * iword; // acount for SeedSTD_RD, ActiveFlagAll - -#if (PROFlevel >= 1) - t = SuperLU_timer_() - t; - if (!iam) - printf(".. 
Construct Reduce tree for U: %.2f\t\n", t); + if ( myrow == pr ) { + lb = LBi( gb, grid ); /* Local block number */ + if (rb_marker[lb] <= jb) { /* First see this block */ + rb_marker[lb] = jb + 1; + Lrb_length[lb] = 1; + Lrb_number[nrbl++] = gb; + if ( gb != jb ) /* Exclude diagonal block. */ + ++fmod[lb]; /* Mod. count for forward solve */ + if ( kseen == 0 && myrow != jbrow ) { + ++nfrecvx; + kseen = 1; + } +#if ( PRNTlevel>=1 ) + ++nLblocks; #endif - } - //////////////////////////////////////////////////////// - - Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; - Llu->Lrowind_bc_dat = Lrowind_bc_dat; - Llu->Lrowind_bc_offset = Lrowind_bc_offset; - Llu->Lrowind_bc_cnt = Lrowind_bc_cnt; - - Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr; - Llu->Lindval_loc_bc_dat = Lindval_loc_bc_dat; - Llu->Lindval_loc_bc_offset = Lindval_loc_bc_offset; - Llu->Lindval_loc_bc_cnt = Lindval_loc_bc_cnt; - - Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; - Llu->Lnzval_bc_dat = Lnzval_bc_dat; - Llu->Lnzval_bc_offset = Lnzval_bc_offset; - Llu->Lnzval_bc_cnt = Lnzval_bc_cnt; - - Llu->Ufstnz_br_ptr = Ufstnz_br_ptr; - Llu->Ufstnz_br_dat = Ufstnz_br_dat; - Llu->Ufstnz_br_offset = Ufstnz_br_offset; - Llu->Ufstnz_br_cnt = Ufstnz_br_cnt; - - Llu->Unzval_br_ptr = Unzval_br_ptr; - Llu->Unzval_br_dat = Unzval_br_dat; - Llu->Unzval_br_offset = Unzval_br_offset; - Llu->Unzval_br_cnt = Unzval_br_cnt; - - Llu->Unnz = Unnz; - Llu->ToRecv = ToRecv; - Llu->ToSendD = ToSendD; - Llu->ToSendR = ToSendR; - Llu->fmod = fmod; - Llu->fsendx_plist = fsendx_plist; - Llu->nfrecvx = nfrecvx; - Llu->nfsendx = nfsendx; - Llu->bmod = bmod; - Llu->bsendx_plist = bsendx_plist; - Llu->nbrecvx = nbrecvx; - Llu->nbsendx = nbsendx; - Llu->ilsum = ilsum; - Llu->ldalsum = ldaspa; - - Llu->LRtree_ptr = LRtree_ptr; - Llu->LBtree_ptr = LBtree_ptr; - Llu->URtree_ptr = URtree_ptr; - Llu->UBtree_ptr = UBtree_ptr; - - Llu->Linv_bc_ptr = Linv_bc_ptr; - Llu->Linv_bc_dat = Linv_bc_dat; - Llu->Linv_bc_offset = Linv_bc_offset; - Llu->Linv_bc_cnt = 
Linv_bc_cnt; - - Llu->Uinv_bc_ptr = Uinv_bc_ptr; - Llu->Uinv_bc_dat = Uinv_bc_dat; - Llu->Uinv_bc_offset = Uinv_bc_offset; - Llu->Uinv_bc_cnt = Uinv_bc_cnt; - - Llu->Urbs = Urbs; - Llu->Ucb_indptr = Ucb_indptr; - Llu->Ucb_inddat = Ucb_inddat; - Llu->Ucb_indoffset = Ucb_indoffset; - Llu->Ucb_indcnt = Ucb_indcnt; - Llu->Ucb_valptr = Ucb_valptr; - Llu->Ucb_valdat = Ucb_valdat; - Llu->Ucb_valoffset = Ucb_valoffset; - Llu->Ucb_valcnt = Ucb_valcnt; - -#ifdef GPU_ACC - if (!grid3d->zscp.Iam && 0) - { - checkGPU(gpuMalloc((void **)&Llu->d_xsup, (n + 1) * sizeof(int_t))); - checkGPU(gpuMemcpy(Llu->d_xsup, xsup, (n + 1) * sizeof(int_t), gpuMemcpyHostToDevice)); - checkGPU(gpuMalloc((void **)&Llu->d_LRtree_ptr, CEILING(nsupers, grid->nprow) * sizeof(C_Tree))); - checkGPU(gpuMalloc((void **)&Llu->d_LBtree_ptr, CEILING(nsupers, grid->npcol) * sizeof(C_Tree))); - checkGPU(gpuMalloc((void **)&Llu->d_URtree_ptr, CEILING(nsupers, grid->nprow) * sizeof(C_Tree))); - checkGPU(gpuMalloc((void **)&Llu->d_UBtree_ptr, CEILING(nsupers, grid->npcol) * sizeof(C_Tree))); - checkGPU(gpuMemcpy(Llu->d_LRtree_ptr, Llu->LRtree_ptr, CEILING(nsupers, grid->nprow) * sizeof(C_Tree), gpuMemcpyHostToDevice)); - checkGPU(gpuMemcpy(Llu->d_LBtree_ptr, Llu->LBtree_ptr, CEILING(nsupers, grid->npcol) * sizeof(C_Tree), gpuMemcpyHostToDevice)); - checkGPU(gpuMemcpy(Llu->d_URtree_ptr, Llu->URtree_ptr, CEILING(nsupers, grid->nprow) * sizeof(C_Tree), gpuMemcpyHostToDevice)); - checkGPU(gpuMemcpy(Llu->d_UBtree_ptr, Llu->UBtree_ptr, CEILING(nsupers, grid->npcol) * sizeof(C_Tree), gpuMemcpyHostToDevice)); - checkGPU(gpuMalloc((void **)&Llu->d_Lrowind_bc_dat, (Llu->Lrowind_bc_cnt) * sizeof(int_t))); - checkGPU(gpuMemcpy(Llu->d_Lrowind_bc_dat, Llu->Lrowind_bc_dat, (Llu->Lrowind_bc_cnt) * sizeof(int_t), gpuMemcpyHostToDevice)); - checkGPU(gpuMalloc((void **)&Llu->d_Lindval_loc_bc_dat, (Llu->Lindval_loc_bc_cnt) * sizeof(int_t))); - checkGPU(gpuMemcpy(Llu->d_Lindval_loc_bc_dat, Llu->Lindval_loc_bc_dat, 
(Llu->Lindval_loc_bc_cnt) * sizeof(int_t), gpuMemcpyHostToDevice)); - checkGPU(gpuMalloc((void **)&Llu->d_Lrowind_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int))); - checkGPU(gpuMemcpy(Llu->d_Lrowind_bc_offset, Llu->Lrowind_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice)); - checkGPU(gpuMalloc((void **)&Llu->d_Lindval_loc_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int))); - checkGPU(gpuMemcpy(Llu->d_Lindval_loc_bc_offset, Llu->Lindval_loc_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice)); - checkGPU(gpuMalloc((void **)&Llu->d_Lnzval_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int))); - checkGPU(gpuMemcpy(Llu->d_Lnzval_bc_offset, Llu->Lnzval_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice)); - - // some dummy allocation to avoid checking whether they are null pointers later - checkGPU(gpuMalloc((void **)&Llu->d_Ucolind_bc_dat, sizeof(int_t))); - checkGPU(gpuMalloc((void **)&Llu->d_Ucolind_bc_offset, sizeof(int64_t))); - checkGPU(gpuMalloc((void **)&Llu->d_Unzval_bc_dat, sizeof(double))); - checkGPU(gpuMalloc((void **)&Llu->d_Unzval_bc_offset, sizeof(int64_t))); - checkGPU(gpuMalloc((void **)&Llu->d_Uindval_loc_bc_dat, sizeof(int_t))); - checkGPU(gpuMalloc((void **)&Llu->d_Uindval_loc_bc_offset, sizeof(int_t))); - - checkGPU(gpuMalloc((void **)&Llu->d_Linv_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int))); - checkGPU(gpuMemcpy(Llu->d_Linv_bc_offset, Llu->Linv_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice)); - checkGPU(gpuMalloc((void **)&Llu->d_Uinv_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int))); - checkGPU(gpuMemcpy(Llu->d_Uinv_bc_offset, Llu->Uinv_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice)); - checkGPU(gpuMalloc((void **)&Llu->d_ilsum, (CEILING(nsupers, grid->nprow) + 1) * sizeof(int_t))); - 
checkGPU(gpuMemcpy(Llu->d_ilsum, Llu->ilsum, (CEILING(nsupers, grid->nprow) + 1) * sizeof(int_t), gpuMemcpyHostToDevice)); - - /* gpuMemcpy for the following is performed in pxgssvx */ - checkGPU(gpuMalloc((void **)&Llu->d_Lnzval_bc_dat, (Llu->Lnzval_bc_cnt) * sizeof(double))); - checkGPU(gpuMalloc((void **)&Llu->d_Linv_bc_dat, (Llu->Linv_bc_cnt) * sizeof(double))); - checkGPU(gpuMalloc((void **)&Llu->d_Uinv_bc_dat, (Llu->Uinv_bc_cnt) * sizeof(double))); - } + } else { + ++Lrb_length[lb]; + } + ++len; + } + } /* for i ... */ + + + if ( nrbl) { /* Do not ensure the blocks are sorted! */ + if(superGridMap[jb]!= NOT_IN_GRID){ // YL: added supernode mask here + /* Set up the initial pointers for each block in + index[] and nzval[]. */ + /* Add room for descriptors */ + len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; + if ( !(index = intMalloc_dist(len1)) ) + ABORT("Malloc fails for index[]"); + if (!(lusup = (double*)SUPERLU_MALLOC(len*nsupc * sizeof(double)))) + ABORT("Malloc fails for lusup[]"); + if ( !(Lindval_loc_bc_ptr[ljb] = intCalloc_dist(nrbl*3)) ) + ABORT("Malloc fails for Lindval_loc_bc_ptr[ljb][]"); + myrow = MYROW( iam, grid ); + krow = PROW( jb, grid ); + if(myrow==krow){ /* diagonal block */ + if (!(Linv_bc_ptr[ljb] = (double*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(double)))) + ABORT("Malloc fails for Linv_bc_ptr[ljb][]"); + if (!(Uinv_bc_ptr[ljb] = (double*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(double)))) + ABORT("Malloc fails for Uinv_bc_ptr[ljb][]"); + }else{ + Linv_bc_ptr[ljb] = NULL; + Uinv_bc_ptr[ljb] = NULL; + } + + mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 ); + mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc ); + mybufmax[4] = SUPERLU_MAX( mybufmax[4], len ); + mem_use += len*nsupc*dword + (len1)*iword; + memTRS += nrbl*3.0*iword + 2.0*nsupc*nsupc*dword; //acount for Lindval_loc_bc_ptr[ljb],Linv_bc_ptr[ljb],Uinv_bc_ptr[ljb] + index[0] = nrbl; /* Number of row blocks */ + index[1] = len; /* LDA of the nzval[] */ + next_lind = BC_HEADER; + next_lval 
= 0; + for (k = 0; k < nrbl; ++k) { + gb = Lrb_number[k]; + lb = LBi( gb, grid ); + len = Lrb_length[lb]; + Lindval_loc_bc_ptr[ljb][k] = lb; + Lindval_loc_bc_ptr[ljb][k+nrbl] = next_lind; + Lindval_loc_bc_ptr[ljb][k+nrbl*2] = next_lval; + Lrb_length[lb] = 0; /* Reset vector of block length */ + index[next_lind++] = gb; /* Descriptor */ + index[next_lind++] = len; + Lrb_indptr[lb] = next_lind; + Lrb_valptr[lb] = next_lval; + next_lind += len; + next_lval += len; + } + /* Propagate the compressed row subscripts to Lindex[], + and the initial values of A from SPA into Lnzval[]. */ + len = index[1]; /* LDA of lusup[] */ + for (i = istart; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + gb = BlockNum( irow ); + if ( myrow == PROW( gb, grid ) ) { + lb = LBi( gb, grid ); + k = Lrb_indptr[lb]++; /* Random access a block */ + index[k] = irow; + k = Lrb_valptr[lb]++; + irow = ilsum[lb] + irow - FstBlockC( gb ); + for (j = 0, dense_col = dense; j < nsupc; ++j) { + lusup[k] = dense_col[irow]; + dense_col[irow] = 0.0; + k += len; + dense_col += ldaspa; + } + } + } /* for i ... */ + + Lrowind_bc_ptr[ljb] = index; + Lnzval_bc_ptr[ljb] = lusup; + + /* sort Lindval_loc_bc_ptr[ljb], Lrowind_bc_ptr[ljb] + and Lnzval_bc_ptr[ljb] here. */ + if(nrbl>1){ + krow = PROW( jb, grid ); + if(myrow==krow){ /* skip the diagonal block */ + uu=nrbl-2; + lloc = &Lindval_loc_bc_ptr[ljb][1]; + }else{ + uu=nrbl-1; + lloc = Lindval_loc_bc_ptr[ljb]; + } + quickSortM(lloc,0,uu,nrbl,0,3); + } + + + if ( !(index_srt = intMalloc_dist(len1)) ) + ABORT("Malloc fails for index_srt[]"); + if (!(lusup_srt = (double*)SUPERLU_MALLOC(len*nsupc * sizeof(double)))) + ABORT("Malloc fails for lusup_srt[]"); + + idx_indx = BC_HEADER; + idx_lusup = 0; + for (jj=0;jj=1 ) + t_l += SuperLU_timer_() - t; #endif - -#if (PRNTlevel >= 1) - if (!iam) - printf(".. # L blocks " IFMT "\t# U blocks " IFMT "\n", - nLblocks, nUblocks); + } /* if mycol == pc */ + + } /* for jb ... 
*/ + + ///////////////////////////////////////////////////////////////// + + /* Set up additional pointers for the index and value arrays of U. + nub is the number of local block columns. */ + nub = CEILING( nsupers, grid->npcol); /* Number of local block columns. */ + if ( !(Urbs = (int_t *) intCalloc_dist(2*nub)) ) + ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero + blocks in a block column. */ + Urbs1 = Urbs + nub; + if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) ) + ABORT("Malloc fails for Ucb_indptr[]"); + if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) ) + ABORT("Malloc fails for Ucb_valptr[]"); + + mem_use += nub * sizeof(Ucb_indptr_t *) + nub * sizeof(int_t *) + (2*nub)*iword; + + + nlb = CEILING( nsupers, grid->nprow ); /* Number of local block rows. */ + + /* Count number of row blocks in a block column. + One pass of the skeleton graph of U. */ + for (lk = 0; lk < nlb; ++lk) { + usub1 = Ufstnz_br_ptr[lk]; + // YL: no need to supernode mask here ???? + if ( usub1 ) { /* Not an empty block row. */ + /* usub1[0] -- number of column blocks in this block row. */ + i = BR_HEADER; /* Pointer in index array. */ + for (lb = 0; lb < usub1[0]; ++lb) { /* For all column blocks. */ + k = usub1[i]; /* Global block number */ + ++Urbs[LBj(k,grid)]; + i += UB_DESCRIPTOR + SuperSize( k ); + } + } + } + + /* Set up the vertical linked lists for the row blocks. + One pass of the skeleton graph of U. */ + for (lb = 0; lb < nub; ++lb) { + // YL: no need to add supernode mask here ???? + if ( Urbs[lb] ) { /* Not an empty block column. 
*/ + if ( !(Ucb_indptr[lb] + = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) ) + ABORT("Malloc fails for Ucb_indptr[lb][]"); + if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) ) + ABORT("Malloc fails for Ucb_valptr[lb][]"); + mem_use += Urbs[lb] * sizeof(Ucb_indptr_t) + (Urbs[lb])*iword; + }else{ + Ucb_valptr[lb]=NULL; + Ucb_indptr[lb]=NULL; + } + } + for (lk = 0; lk < nlb; ++lk) { /* For each block row. */ + usub1 = Ufstnz_br_ptr[lk]; + // printf("ID %5d lk %5d usub1 %10d\n",superGridMap[0],lk, usub1); + // YL: no need to add supernode mask here ???? + if ( usub1 ) { /* Not an empty block row. */ + i = BR_HEADER; /* Pointer in index array. */ + j = 0; /* Pointer in nzval array. */ + + for (lb = 0; lb < usub1[0]; ++lb) { /* For all column blocks. */ + k = usub1[i]; /* Global block number, column-wise. */ + ljb = LBj( k, grid ); /* Local block number, column-wise. */ + Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk; + + Ucb_indptr[ljb][Urbs1[ljb]].indpos = i; + Ucb_valptr[ljb][Urbs1[ljb]] = j; + + ++Urbs1[ljb]; + j += usub1[i+1]; + i += UB_DESCRIPTOR + SuperSize( k ); + } + } + } + + +/* Count the nnzs per block column */ + for (lb = 0; lb < nub; ++lb) { + Unnz[lb] = 0; + k = lb * grid->npcol + mycol;/* Global block number, column-wise. */ + knsupc = SuperSize( k ); + // printf("ID %5d lb %5d Urbs[lb] %10d\n",superGridMap[0],lb, Urbs[lb+nub]); + for (ub = 0; ub < Urbs[lb]; ++ub) { + ik = Ucb_indptr[lb][ub].lbnum; /* Local block number, row-wise. */ + i = Ucb_indptr[lb][ub].indpos; /* Start of the block in usub[]. */ + i += UB_DESCRIPTOR; + gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ + iklrow = FstBlockC( gik+1 ); + for (jj = 0; jj < knsupc; ++jj) { + fnz = Ufstnz_br_ptr[ik][i + jj]; + if ( fnz < iklrow ) { + Unnz[lb] +=iklrow-fnz; + } + } /* for jj ... 
*/ + } + } + + // for (int lb = 0; lb < nub; ++lb) { + // printf("ID %5d lb %5d, superGridMap[lb] %5d, Unnz[lb] %5d\n",superGridMap[0],lb, superGridMap[lb], Unnz[lb]); + // } + + Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; + Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr; + Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; + Llu->Ufstnz_br_ptr = Ufstnz_br_ptr; + Llu->Unzval_br_ptr = Unzval_br_ptr; + Llu->Unnz = Unnz; + Llu->ToRecv = ToRecv; + Llu->ToSendD = ToSendD; + Llu->ToSendR = ToSendR; + Llu->fmod = fmod; + Llu->fsendx_plist = fsendx_plist; + Llu->nfrecvx = nfrecvx; + Llu->nfsendx = nfsendx; + Llu->bmod = bmod; + Llu->bsendx_plist = bsendx_plist; + Llu->nbrecvx = nbrecvx; + Llu->nbsendx = nbsendx; + Llu->ilsum = ilsum; + Llu->ldalsum = ldaspa; + Llu->Linv_bc_ptr = Linv_bc_ptr; + Llu->Uinv_bc_ptr = Uinv_bc_ptr; + Llu->Urbs = Urbs; + Llu->Ucb_indptr = Ucb_indptr; + Llu->Ucb_valptr = Ucb_valptr; + +#if ( PRNTlevel>=1 ) + if ( !iam ) printf(".. # L blocks " IFMT "\t# U blocks " IFMT "\n", + nLblocks, nUblocks); #endif - SUPERLU_FREE(rb_marker); - SUPERLU_FREE(Urb_fstnz); - SUPERLU_FREE(Urb_length); - SUPERLU_FREE(Urb_indptr); - SUPERLU_FREE(Lrb_length); - SUPERLU_FREE(Lrb_number); - SUPERLU_FREE(Lrb_indptr); - SUPERLU_FREE(Lrb_valptr); - SUPERLU_FREE(dense); - - /* Find the maximum buffer size. */ - MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, - MPI_MAX, grid->comm); - - k = CEILING(nsupers, grid->nprow); /* Number of local block rows */ - if (!(Llu->mod_bit = int32Malloc_dist(k))) - ABORT("Malloc fails for mod_bit[]."); - -#if (PROFlevel >= 1) - if (!iam) - printf(".. 
1st distribute time:\n " - "\tL\t%.2f\n\tU\t%.2f\n" - "\tu_blks %d\tnrbu %d\n--------\n", - t_l, t_u, u_blks, nrbu); + SUPERLU_FREE(rb_marker); + SUPERLU_FREE(Urb_fstnz); + SUPERLU_FREE(Urb_length); + SUPERLU_FREE(Urb_indptr); + SUPERLU_FREE(Lrb_length); + SUPERLU_FREE(Lrb_number); + SUPERLU_FREE(Lrb_indptr); + SUPERLU_FREE(Lrb_valptr); + SUPERLU_FREE(dense); + + k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ + mem_use -= (k*8)*iword+ldaspa*sp_ienv_dist(3,options)*dword; + + /* Find the maximum buffer size. */ + MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, + MPI_MAX, grid->comm); + + k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(Llu->mod_bit = int32Malloc_dist(k)) ) + ABORT("Malloc fails for mod_bit[]."); + +#if ( PROFlevel>=1 ) + if ( !iam ) printf(".. 1st distribute time:\n " + "\tL\t%.2f\n\tU\t%.2f\n" + "\tu_blks %d\tnrbu %d\n--------\n", + t_l, t_u, u_blks, nrbu); #endif } /* else fact != SamePattern_SameRowPerm */ - if (xa[A->ncol] > 0) - { /* may not have any entries on this process. */ + if ( xa[A->ncol] > 0 ) { /* may not have any entries on this process. */ SUPERLU_FREE(asub); SUPERLU_FREE(a); } SUPERLU_FREE(xa); -#if (DEBUGlevel >= 1) +#if ( DEBUGlevel>=1 ) /* Memory allocated but not freed: ilsum, fmod, fsendx_plist, bmod, bsendx_plist */ - CHECK_MALLOC(iam, "Exit pddistribute()"); + CHECK_MALLOC(iam, "Exit pddistribute_allgrid()"); #endif - return (mem_use + memTRS); + return (mem_use+memTRS); + +} /* PDDISTRIBUTE3D_Yang */ -} /* PDDISTRIBUTE */ diff --git a/SRC/pdgssvx3d.c b/SRC/pdgssvx3d.c index cced6e42..92593fb2 100755 --- a/SRC/pdgssvx3d.c +++ b/SRC/pdgssvx3d.c @@ -23,6 +23,10 @@ at the top-level directory. */ #include "superlu_ddefs.h" #include "TRF3dV100/superlu_summit.h" +#include "pddistribute3d.h" +#include "ssvx3dAux.c" +int_t dgatherAllFactoredLU3d( dtrf3Dpartition_t* trf3Dpartition, + dLUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT ); #include /*! 
\brief * @@ -497,8 +501,6 @@ at the top-level directory. * See superlu_ddefs.h for the definitions of varioous data types. * */ -// dSOLVEstruct3d_t * SOLVEstruct, -// SOLVEstruct->A3d int writeLUtoDisk(int nsupers, int_t *xsup, dLUstruct_t *LUstruct) { @@ -714,10 +716,9 @@ void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A, SuperMatrix GA; /* Global A in NC format */ NCformat *GAstore; double *a_GA; - SuperMatrix GAC; /* Global A in NCP format (add n end pointers) */ - NCPformat *GACstore; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - Glu_freeable_t *Glu_freeable; + Glu_freeable_t *Glu_freeable = NULL; /* The nonzero structures of L and U factors, which are replicated on all processrs. (lsub, xlsub) contains the compressed subscript of @@ -778,34 +779,8 @@ void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A, /* Test the options choices. */ *info = 0; Fact = options->Fact; - if (Fact < 0 || Fact > FACTORED) - *info = -1; - else if (options->RowPerm < 0 || options->RowPerm > MY_PERMR) - *info = -1; - else if (options->ColPerm < 0 || options->ColPerm > MY_PERMC) - *info = -1; - else if (options->IterRefine < 0 || options->IterRefine > SLU_EXTRA) - *info = -1; - else if (options->IterRefine == SLU_EXTRA) - { - *info = -1; - fprintf(stderr, - "Extra precise iterative refinement yet to support."); - } - else if (A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc || A->Dtype != SLU_D || A->Mtype != SLU_GE) - *info = -2; - else if (ldb < Astore->m_loc) - *info = -5; - else if (nrhs < 0) - { - *info = -6; - } - if (*info) - { - i = -(*info); - pxerr_dist("pdgssvx3d", grid, -(*info)); - return; - } + + validateInput_ssvx3d(options, A, ldb, nrhs, grid3d, info); /* Initialization. 
*/ @@ -826,7 +801,7 @@ void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A, B is then aliased to B2d for the following 2D solve; */ dGatherNRformat_loc3d_allgrid(Fact, (NRformat_loc *)A->Store, - B, ldb, nrhs, grid3d, &A3d); + B, ldb, nrhs, grid3d, &A3d); B = (double *)A3d->B2d; /* B is now pointing to B2d, allocated in dGatherNRformat_loc3d. */ @@ -880,8 +855,13 @@ void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A, Equil = (!factored && options->Equil == YES); notran = (options->Trans == NOTRANS); - iam = grid->iam; + + if (grid3d->zscp.Iam == 0) /* on 2D grid-0 */ + { + + /* The following code now works on 2D grid-0 */ + job = 5; /* Extract equilibration status from a previous factorization */ if (factored || (Fact == SamePattern_SameRowPerm && Equil)) @@ -896,154 +876,27 @@ void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A, rowequ = colequ = FALSE; } + /* Not factored & ask for equilibration, then alloc RC */ + if (Equil && Fact != SamePattern_SameRowPerm) + dallocScalePermstruct_RC(ScalePermstruct, m, n); + /* The following arrays are replicated on all processes. */ perm_r = ScalePermstruct->perm_r; perm_c = ScalePermstruct->perm_c; etree = LUstruct->etree; R = ScalePermstruct->R; C = ScalePermstruct->C; - /********/ - - /* Not factored & ask for equilibration */ - if (Equil && Fact != SamePattern_SameRowPerm) - { - /* Allocate storage if not done so before. 
*/ - switch (ScalePermstruct->DiagScale) - { - case NOEQUIL: - if (!(R = (double *)doubleMalloc_dist(m))) - ABORT("Malloc fails for R[]."); - if (!(C = (double *)doubleMalloc_dist(n))) - ABORT("Malloc fails for C[]."); - ScalePermstruct->R = R; - ScalePermstruct->C = C; - break; - case ROW: - if (!(C = (double *)doubleMalloc_dist(n))) - ABORT("Malloc fails for C[]."); - ScalePermstruct->C = C; - break; - case COL: - if (!(R = (double *)doubleMalloc_dist(m))) - ABORT("Malloc fails for R[]."); - ScalePermstruct->R = R; - break; - default: - break; - } - } /* ------------------------------------------------------------ Diagonal scaling to equilibrate the matrix. ------------------------------------------------------------ */ if (Equil) { -#if (DEBUGlevel >= 1) - CHECK_MALLOC(iam, "Enter equil"); -#endif - t = SuperLU_timer_(); - - if (Fact == SamePattern_SameRowPerm) - { - /* Reuse R and C. */ - switch (ScalePermstruct->DiagScale) - { - case NOEQUIL: - break; - case ROW: - irow = fst_row; - for (j = 0; j < m_loc; ++j) - { - for (i = rowptr[j]; i < rowptr[j + 1]; ++i) - { - a[i] *= R[irow]; /* Scale rows. */ - } - ++irow; - } - break; - case COL: - for (j = 0; j < m_loc; ++j) - for (i = rowptr[j]; i < rowptr[j + 1]; ++i) - { - icol = colind[i]; - a[i] *= C[icol]; /* Scale columns. */ - } - break; - case BOTH: - irow = fst_row; - for (j = 0; j < m_loc; ++j) - { - for (i = rowptr[j]; i < rowptr[j + 1]; ++i) - { - icol = colind[i]; - a[i] *= R[irow] * C[icol]; /* Scale rows and cols. */ - } - ++irow; - } - break; - } - } - else - { /* Compute R & C from scratch */ - /* Compute the row and column scalings. 
*/ - pdgsequ(A, R, C, &rowcnd, &colcnd, &amax, &iinfo, grid); + scaleMatrixDiagonally(Fact, ScalePermstruct, + A, stat, grid, &rowequ, &colequ, &iinfo); + if (iinfo < 0) + return; // return if error - if (iinfo > 0) - { - if (iinfo <= m) - { -#if (PRNTlevel >= 1) - fprintf(stderr, "The " IFMT "-th row of A is exactly zero\n", iinfo); -#endif - } - else - { -#if (PRNTlevel >= 1) - fprintf(stderr, "The " IFMT "-th column of A is exactly zero\n", iinfo - n); -#endif - } - } - else if (iinfo < 0) - return; - - /* Now iinfo == 0 */ - - /* Equilibrate matrix A if it is badly-scaled. - A <-- diag(R)*A*diag(C) */ - pdlaqgs(A, R, C, rowcnd, colcnd, amax, equed); - - if (strncmp(equed, "R", 1) == 0) - { - ScalePermstruct->DiagScale = ROW; - rowequ = ROW; - } - else if (strncmp(equed, "C", 1) == 0) - { - ScalePermstruct->DiagScale = COL; - colequ = COL; - } - else if (strncmp(equed, "B", 1) == 0) - { - ScalePermstruct->DiagScale = BOTH; - rowequ = ROW; - colequ = COL; - } - else - ScalePermstruct->DiagScale = NOEQUIL; - -#if (PRNTlevel >= 1) - if (iam == 0) - { - printf(".. equilibrated? *equed = %c\n", *equed); - fflush(stdout); - } -#endif - } /* end if-else Fact ... */ - - stat->utime[EQUIL] = SuperLU_timer_() - t; -#if (DEBUGlevel >= 1) - CHECK_MALLOC(iam, "Exit equil"); -#endif } /* end if Equil ... 
LAPACK style, not involving MC64 */ if (!factored) @@ -1057,253 +910,28 @@ void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A, if (Fact != SamePattern_SameRowPerm && (parSymbFact == NO || options->RowPerm != NO)) { - - need_value = (options->RowPerm == LargeDiag_MC64); - + int_t need_value = (options->RowPerm == LargeDiag_MC64); pdCompRow_loc_to_CompCol_global(need_value, A, grid, &GA); - GAstore = (NCformat *)GA.Store; - colptr = GAstore->colptr; - rowind = GAstore->rowind; nnz = GAstore->nnz; - GA_mem_use = (nnz + n + 1) * sizeof(int_t); - - if (need_value) - { - a_GA = (double *)GAstore->nzval; - GA_mem_use += nnz * sizeof(double); - } - - else + GA_mem_use = (nnz + n + 1) * sizeof(int_t) + need_value * nnz * sizeof(double); + if (!need_value) assert(GAstore->nzval == NULL); } /* ------------------------------------------------------------ Find the row permutation for A. - ------------------------------------------------------------ */ - if (options->RowPerm != NO) - { - t = SuperLU_timer_(); - if (Fact != SamePattern_SameRowPerm) - { - if (options->RowPerm == MY_PERMR) - { - /* Use user's perm_r. */ - /* Permute the global matrix GA for symbfact() */ - for (i = 0; i < colptr[n]; ++i) - { - irow = rowind[i]; - rowind[i] = perm_r[irow]; - } - } - else if (options->RowPerm == LargeDiag_MC64) - { - /* Get a new perm_r[] */ - if (job == 5) - { - /* Allocate storage for scaling factors. 
*/ - if (!(R1 = doubleMalloc_dist(m))) - ABORT("SUPERLU_MALLOC fails for R1[]"); - if (!(C1 = doubleMalloc_dist(n))) - ABORT("SUPERLU_MALLOC fails for C1[]"); - } - - if (iam == 0) - { - /* Process 0 finds a row permutation */ - iinfo = dldperm_dist(job, m, nnz, colptr, rowind, a_GA, - perm_r, R1, C1); - MPI_Bcast(&iinfo, 1, mpi_int_t, 0, grid->comm); - if (iinfo == 0) - { - MPI_Bcast(perm_r, m, mpi_int_t, 0, grid->comm); - if (job == 5 && Equil) - { - MPI_Bcast(R1, m, MPI_DOUBLE, 0, grid->comm); - MPI_Bcast(C1, n, MPI_DOUBLE, 0, grid->comm); - } - } - } - else - { - MPI_Bcast(&iinfo, 1, mpi_int_t, 0, grid->comm); - if (iinfo == 0) - { - MPI_Bcast(perm_r, m, mpi_int_t, 0, grid->comm); - if (job == 5 && Equil) - { - MPI_Bcast(R1, m, MPI_DOUBLE, 0, grid->comm); - MPI_Bcast(C1, n, MPI_DOUBLE, 0, grid->comm); - } - } - } - - if (iinfo && job == 5) - { /* Error return */ - SUPERLU_FREE(R1); - SUPERLU_FREE(C1); - } -#if (PRNTlevel >= 2) - dmin = damch_dist("Overflow"); - dsum = 0.0; - dprod = 1.0; -#endif - if (iinfo == 0) - { - if (job == 5) - { - if (Equil) - { - for (i = 0; i < n; ++i) - { - R1[i] = exp(R1[i]); - C1[i] = exp(C1[i]); - } - - /* Scale the distributed matrix further. - A <-- diag(R1)*A*diag(C1) */ - irow = fst_row; - for (j = 0; j < m_loc; ++j) - { - for (i = rowptr[j]; i < rowptr[j + 1]; ++i) - { - icol = colind[i]; - a[i] *= R1[irow] * C1[icol]; -#if (PRNTlevel >= 2) - if (perm_r[irow] == icol) - { - /* New diagonal */ - if (job == 2 || job == 3) - dmin = SUPERLU_MIN(dmin, fabs(a[i])); - else if (job == 4) - dsum += fabs(a[i]); - else if (job == 5) - dprod *= fabs(a[i]); - } -#endif - } - ++irow; - } - - /* Multiply together the scaling factors -- - R/C from simple scheme, R1/C1 from MC64. 
*/ - if (rowequ) - for (i = 0; i < m; ++i) - R[i] *= R1[i]; - else - for (i = 0; i < m; ++i) - R[i] = R1[i]; - if (colequ) - for (i = 0; i < n; ++i) - C[i] *= C1[i]; - else - for (i = 0; i < n; ++i) - C[i] = C1[i]; - - ScalePermstruct->DiagScale = BOTH; - rowequ = colequ = 1; - - } /* end if Equil */ - - /* Now permute global A to prepare for symbfact() */ - for (j = 0; j < n; ++j) - { - for (i = colptr[j]; i < colptr[j + 1]; ++i) - { - irow = rowind[i]; - rowind[i] = perm_r[irow]; - } - } - SUPERLU_FREE(R1); - SUPERLU_FREE(C1); - } - else - { /* job = 2,3,4 */ - for (j = 0; j < n; ++j) - { - for (i = colptr[j]; i < colptr[j + 1]; ++i) - { - irow = rowind[i]; - rowind[i] = perm_r[irow]; - } /* end for i ... */ - } /* end for j ... */ - } /* end else job ... */ - } - else - { /* if iinfo != 0 */ - for (i = 0; i < m; ++i) - perm_r[i] = i; - } -#if (PRNTlevel >= 2) - if (job == 2 || job == 3) - { - if (!iam) - printf("\tsmallest diagonal %e\n", dmin); - } - else if (job == 4) - { - if (!iam) - printf("\tsum of diagonal %e\n", dsum); - } - else if (job == 5) - { - if (!iam) - printf("\t product of diagonal %e\n", dprod); - } -#endif - } - else - { /* use LargeDiag_HWPM */ -#ifdef HAVE_COMBBLAS - d_c2cpp_GetHWPM(A, grid, ScalePermstruct); -#else - if (iam == 0) - { - printf("CombBLAS is not available\n"); - fflush(stdout); - } -#endif - } /* end if-else options->RowPerm ... */ - - t = SuperLU_timer_() - t; - stat->utime[ROWPERM] = t; -#if (PRNTlevel >= 1) - if (!iam) - { - printf(".. LDPERM job " IFMT "\t time: %.2f\n", job, t); - fflush(stdout); - } -#endif - } /* end if Fact not SamePattern_SameRowPerm ... 
*/ - } - else - { /* options->RowPerm == NOROWPERM / NATURAL */ - for (i = 0; i < m; ++i) - perm_r[i] = i; - } + ------------------------------------------------------------ */ + perform_row_permutation( + options, Fact, ScalePermstruct, LUstruct, + m, n, grid, A, &GA, stat, job, Equil, + &rowequ, &colequ, &iinfo); -#if (DEBUGlevel >= 2) - if (!iam) - PrintInt10("perm_r", m, perm_r); -#endif } /* end if (!factored) */ + /* Compute norm(A), which will be used to adjust small diagonal. */ if (!factored || options->IterRefine) - { - /* Compute norm(A), which will be used to adjust small diagonal. */ - if (notran) - *(unsigned char *)norm = '1'; - else - *(unsigned char *)norm = 'I'; - anorm = pdlangs(norm, A, grid); -#if (PRNTlevel >= 1) - if (!iam) - { - printf(".. anorm %e\n", anorm); - fflush(stdout); - } -#endif - } + anorm = computeA_Norm(notran, A, grid); /* ------------------------------------------------------------ Perform ordering and symbolic factorization @@ -1324,6 +952,10 @@ void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A, if (parSymbFact == YES || permc_spec == PARMETIS) { + if(grid3d->npdep!=1){ + fprintf(stderr, "Error: ParMETIS and Parallel Symbolic Factorization are not yet supported with grid3d->npdep>1.\n"); + return; // or exit(-1); if you want to terminate the program + } nprocs_num = grid->nprow * grid->npcol; noDomains = (int)(pow(2, ((int)LOG2(nprocs_num)))); @@ -1395,77 +1027,19 @@ void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A, { if (parSymbFact == NO) { - - int_t *GACcolbeg, *GACcolend, *GACrowind; - - sp_colorder(options, &GA, perm_c, etree, &GAC); - - /* Form Pc*A*Pc' to preserve the diagonal of the matrix GAC. 
*/ - GACstore = (NCPformat *)GAC.Store; - GACcolbeg = GACstore->colbeg; - GACcolend = GACstore->colend; - GACrowind = GACstore->rowind; - for (j = 0; j < n; ++j) - { - for (i = GACcolbeg[j]; i < GACcolend[j]; ++i) - { - irow = GACrowind[i]; - GACrowind[i] = perm_c[irow]; - } - } - - /* Perform a symbolic factorization on Pc*Pr*A*Pc' and set up - the nonzero data structures for L & U. */ -#if (PRNTlevel >= 1) - if (!iam) - printf(".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n", - sp_ienv_dist(2, options), sp_ienv_dist(3, options), sp_ienv_dist(6, options)); -#endif - t = SuperLU_timer_(); - if (!(Glu_freeable = (Glu_freeable_t *) - SUPERLU_MALLOC(sizeof(Glu_freeable_t)))) - ABORT("Malloc fails for Glu_freeable."); - - /* Every process does this. */ - iinfo = symbfact(options, iam, &GAC, perm_c, etree, - Glu_persist, Glu_freeable); - - stat->utime[SYMBFAC] = SuperLU_timer_() - t; - if (iinfo < 0) - { - /* Successful return */ - QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage); - -#if (PRNTlevel >= 1) - if (!iam) - { - printf("\tNo of supers %ld\n", - (long)Glu_persist->supno[n - 1] + 1); - printf("\tSize of G(L) %ld\n", (long)Glu_freeable->xlsub[n]); - printf("\tSize of G(U) %ld\n", (long)Glu_freeable->xusub[n]); - printf("\tint %lu, short %lu, float %lu, double %lu\n", - sizeof(int_t), sizeof(short), - sizeof(float), sizeof(double)); - printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", - symb_mem_usage.for_lu * 1e-6, - symb_mem_usage.total * 1e-6, - symb_mem_usage.expansions); - } -#endif - } - else - { - if (!iam) - { - fprintf(stderr, "symbfact() error returns %d\n", - (int)iinfo); - exit(-1); - } - } + /*Allocating Glu_freeable used by symbfact */ + if (!(Glu_freeable = (Glu_freeable_t *) + SUPERLU_MALLOC(sizeof(Glu_freeable_t)))) + ABORT("Malloc fails for Glu_freeable."); + permCol_SymbolicFact3d(options, n, &GA, perm_c, etree, + Glu_persist, Glu_freeable, stat, + &symb_mem_usage, + grid3d); } /* end serial symbolic factorization */ 
else { /* parallel symbolic factorization */ + //TODO: need a 3D version of symbfact_dist t = SuperLU_timer_(); flinfo = symbfact_dist(options, nprocs_num, noDomains, @@ -1481,120 +1055,127 @@ void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A, /* Destroy GA */ if (parSymbFact == NO || options->RowPerm != NO) Destroy_CompCol_Matrix_dist(&GA); - if (parSymbFact == NO) - Destroy_CompCol_Permuted_dist(&GAC); } /* end if Fact not SamePattern_SameRowPerm */ + } /* end if not Factored */ + } /* end 2D process layer 0 */ + + MPI_Bcast(&rowequ, 1, mpi_int_t, 0, grid3d->zscp.comm); + MPI_Bcast(&colequ, 1, mpi_int_t, 0, grid3d->zscp.comm); -#if (DEBUGlevel >= 2) // Sherry - if (!iam) - PrintInt10("perm_c", m, perm_c); -#endif - if (sizes) - SUPERLU_FREE(sizes); - if (fstVtxSep) - SUPERLU_FREE(fstVtxSep); - if (symb_comm != MPI_COMM_NULL) - MPI_Comm_free(&symb_comm); - - if (parSymbFact == NO || Fact == SamePattern_SameRowPerm) + /* Broadcast Permuted A and symbolic factorization data from 2d to 3d grid*/ + if (Fact != SamePattern_SameRowPerm && !factored) // place the exact conditions later //all the grid must execute this + { + if (parSymbFact == NO){ + if (Glu_freeable == NULL) { - /* Apply column permutation to the original distributed A */ - for (j = 0; j < nnz_loc; ++j) - colind[j] = perm_c[colind[j]]; - - /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. - NOTE: the row permutation Pc*Pr is applied internally in the - distribution routine. 
*/ - t = SuperLU_timer_(); - - nsupers = getNsupers(n, LUstruct->Glu_persist); - int* supernodeMask; - if(Fact == SamePattern_SameRowPerm){ - supernodeMask=trf3Dpartition->supernodeMask; - dist_mem_use = pddistribute_allgrid(options, n, A, ScalePermstruct, - Glu_freeable, LUstruct, grid, supernodeMask); - - }else{ - - // First call of pddistribute_allgrid with a prefixed supernodeMask - // YL: this first call can be removed with Piyush's cleaner fix - int* supernodeMask = int32Calloc_dist(nsupers); - for (int i=0;izscp.Iam == i%grid3d->npdep) - supernodeMask[i]=1; - } - dist_mem_use = pddistribute_allgrid_index_only(options, n, A, ScalePermstruct, - Glu_freeable, LUstruct, grid, supernodeMask); - SUPERLU_FREE(supernodeMask); + if (!(Glu_freeable = (Glu_freeable_t *) + SUPERLU_MALLOC(sizeof(Glu_freeable_t)))) + ABORT("Malloc fails for Glu_freeable."); + } + bcastPermutedSparseA(A, + ScalePermstruct, + Glu_freeable, + LUstruct, grid3d); + }else{ + //TODO: need a parmetis version of bcastPermutedSparseA broadcasting Pslu_freeable + } + } - // Generate the 3D partition - dDestroy_trf3Dpartition(LUstruct->trf3Dpart); - trf3Dpartition = dinitTrf3Dpartition_allgrid(n, options, LUstruct, grid3d); - LUstruct->trf3Dpart=trf3Dpartition; - - // Delete the meta data generated by pddistribute_allgrid - dLocalLU_t *Llu = LUstruct->Llu; - for (int jb = 0; jb < CEILING( nsupers, grid->npcol ); ++jb) { /* for each block column ... */ - if ( Llu->Lrowind_bc_ptr[jb] ) { - SUPERLU_FREE (Llu->Lrowind_bc_ptr[jb]); - } - } - SUPERLU_FREE (Llu->Lrowind_bc_ptr); - for (int lb = 0; lb < CEILING( nsupers, grid->nprow ); ++lb) { /* for each block row ... 
*/ - if(Llu->Ufstnz_br_ptr[lb]!=NULL) - SUPERLU_FREE(Llu->Ufstnz_br_ptr[lb]); - } - SUPERLU_FREE(Llu->Ufstnz_br_ptr); + perm_r = ScalePermstruct->perm_r; + perm_c = ScalePermstruct->perm_c; + etree = LUstruct->etree; + R = ScalePermstruct->R; + C = ScalePermstruct->C; + nsupers = getNsupers(n, LUstruct->Glu_persist); + Astore = (NRformat_loc *)A->Store; + a = (double *)Astore->nzval; + rowptr = Astore->rowptr; + colind = Astore->colind; + Glu_persist = LUstruct->Glu_persist; + // perform the 3D distribution + if (!factored) + { + /* Apply column permutation to the original distributed A */ + for (j = 0; j < nnz_loc; ++j) + colind[j] = perm_c[colind[j]]; + // free quauntities used in Parmetis + if (sizes) + SUPERLU_FREE(sizes); + if (fstVtxSep) + SUPERLU_FREE(fstVtxSep); + if (symb_comm != MPI_COMM_NULL) + MPI_Comm_free(&symb_comm); + if ( Fact != SamePattern_SameRowPerm){ + LUstruct->trf3Dpart = SUPERLU_MALLOC(sizeof(dtrf3Dpartition_t)); + // computes the new partition for 3D factorization here + trf3Dpartition=LUstruct->trf3Dpart; + newTrfPartitionInit(nsupers, LUstruct, grid3d); + } + } - // Second call of pddistribute_allgrid with the final supernodeMask - dist_mem_use = pddistribute_allgrid(options, n, A, ScalePermstruct, - Glu_freeable, LUstruct, grid, trf3Dpartition->supernodeMask); - /* now that LU structure has been scattered, initialize the LU and buffers */ - dinit3DLUstructForest(trf3Dpartition->myTreeIdxs, trf3Dpartition->myZeroTrIdxs, - trf3Dpartition->sForests, LUstruct, grid3d); - dLUValSubBuf_t *LUvsb = SUPERLU_MALLOC(sizeof(dLUValSubBuf_t)); - dLluBufInit(LUvsb, LUstruct); - trf3Dpartition->LUvsb = LUvsb; - } + // perform the 3D distribution + if (!factored) + { /* Skip this if already factored. */ + if (parSymbFact == NO || Fact == SamePattern_SameRowPerm) + { - stat->utime[DIST] = SuperLU_timer_() - t; - /* Deallocate storage used in symbolic factorization. 
*/ - if (Fact != SamePattern_SameRowPerm) - { - iinfo = symbfact_SubFree(Glu_freeable); - SUPERLU_FREE(Glu_freeable); - } + /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. + NOTE: the row permutation Pc*Pr is applied internally in the + distribution routine. */ + t = SuperLU_timer_(); - } - else + dist_mem_use = pddistribute3d_Yang(options, n, A, ScalePermstruct, + Glu_freeable, LUstruct, grid3d); + stat->utime[DIST] = SuperLU_timer_() - t; + + /* Deallocate storage used in symbolic factorization. */ + if (Fact != SamePattern_SameRowPerm) { - /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. - NOTE: the row permutation Pc*Pr is applied internally in the - distribution routine. */ - /* Apply column permutation to the original distributed A */ - for (j = 0; j < nnz_loc; ++j) - colind[j] = perm_c[colind[j]]; + iinfo = symbfact_SubFree(Glu_freeable); + SUPERLU_FREE(Glu_freeable); + } + } + else + { + /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. + NOTE: the row permutation Pc*Pr is applied internally in the + distribution routine. 
*/ + + // TODO: need a 3D version of ddist_psymbtonum + t = SuperLU_timer_(); + dist_mem_use = ddist_psymbtonum(options, n, A, ScalePermstruct, + &Pslu_freeable, LUstruct, grid); + if (dist_mem_use > 0) + ABORT("Not enough memory available for dist_psymbtonum\n"); - t = SuperLU_timer_(); - dist_mem_use = ddist_psymbtonum(options, n, A, ScalePermstruct, - &Pslu_freeable, LUstruct, grid); - if (dist_mem_use > 0) - ABORT("Not enough memory available for dist_psymbtonum\n"); + stat->utime[DIST] = SuperLU_timer_() - t; - stat->utime[DIST] = SuperLU_timer_() - t; + ABORT("ddist_psymbtonum does not yet work with 3D factorization\n"); - ABORT("ddist_psymbtonum does not yet work with 3D factorization\n"); + } - } + if(Fact != SamePattern_SameRowPerm){ + // checkDist3DLUStruct(LUstruct, grid3d); + // zeros out the Supernodes that are not owned by the grid + dinit3DLUstructForest(trf3Dpartition->myTreeIdxs, trf3Dpartition->myZeroTrIdxs, + trf3Dpartition->sForests, LUstruct, grid3d); + + dLUValSubBuf_t *LUvsb = SUPERLU_MALLOC(sizeof(dLUValSubBuf_t)); + dLluBufInit(LUvsb, LUstruct); + trf3Dpartition->LUvsb = LUvsb; + trf3Dpartition->iperm_c_supno = create_iperm_c_supno(nsupers, options, LUstruct, grid3d); + } - /*if (!iam) printf ("\tDISTRIBUTE time %8.2f\n", stat->utime[DIST]); */ + /*if (!iam) printf ("\tDISTRIBUTE time %8.2f\n", stat->utime[DIST]); */ + + MPI_Bcast(&anorm, 1, MPI_DOUBLE, 0, grid3d->zscp.comm); /* Perform numerical factorization in parallel on all process layers.*/ @@ -1721,7 +1302,7 @@ void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A, if (writeLU) { if (!grid3d->zscp.Iam) - writeLUtoDisk(nsupers, Glu_persist->xsup, LUstruct); + writeLUtoDisk(nsupers, LUstruct->Glu_persist->xsup, LUstruct); } int checkLU = 0; @@ -1733,7 +1314,7 @@ void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A, if (checkLU) { if (!grid3d->zscp.Iam) - checkLUFromDisk(nsupers, Glu_persist->xsup, LUstruct); + checkLUFromDisk(nsupers, LUstruct->Glu_persist->xsup, 
LUstruct); } #if (PRNTlevel >= 0) @@ -2218,7 +1799,7 @@ if (getenv("SUPERLU_ACC_SOLVE")){ SUPERLU_MALLOC (sizeof (pxgstrs_comm_t)))) ABORT ("Malloc fails for gstrs_comm[]"); pdgstrs_init (n, m_loc, 1, fst_row, perm_r, perm_c, grid, - Glu_persist, SOLVEstruct1); + LUstruct->Glu_persist, SOLVEstruct1); if (getenv("SUPERLU_ACC_SOLVE")){ int_t nsupers = getNsupers(n, LUstruct->Glu_persist); pdgstrs_init_device_lsum_x(options, n, m_loc, 1, grid,LUstruct, SOLVEstruct1,trf3Dpartition->supernodeMask); @@ -2388,7 +1969,7 @@ if (getenv("SUPERLU_ACC_SOLVE")){ SUPERLU_MALLOC (sizeof (pxgstrs_comm_t)))) ABORT ("Malloc fails for gstrs_comm[]"); pdgstrs_init (n, m_loc, 1, fst_row, perm_r, perm_c, grid, - Glu_persist, SOLVEstruct1); + LUstruct->Glu_persist, SOLVEstruct1); if (getenv("SUPERLU_ACC_SOLVE")){ int_t nsupers = getNsupers(n, LUstruct->Glu_persist); int* supernodeMask = int32Malloc_dist(nsupers); diff --git a/SRC/pdgssvx3d_1pass_Yang.c b/SRC/pdgssvx3d_1pass_Yang.c new file mode 100755 index 00000000..a3cea62b --- /dev/null +++ b/SRC/pdgssvx3d_1pass_Yang.c @@ -0,0 +1,2116 @@ + +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + +/*! @file + * \brief Solves a system of linear equations A*X=B using 3D process grid. + * + *
+ * -- Distributed SuperLU routine (version 7.2) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ * October 5, 2021
+ * Last update: November 8, 2021  v7.2.0
+ */
+#include "superlu_ddefs.h"
+#include "TRF3dV100/superlu_summit.h"
+#include "pddistribute3d.h"
+#include "ssvx3dAux.c"
+int_t dgatherAllFactoredLU3d( dtrf3Dpartition_t*  trf3Dpartition,
+			   dLUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT );
+#include 
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * PDGSSVX3D solves a system of linear equations A*X=B,
+ * by using Gaussian elimination with "static pivoting" to
+ * compute the LU factorization of A.
+ *
+ * Static pivoting is a technique that combines the numerical stability
+ * of partial pivoting with the scalability of Cholesky (no pivoting),
+ * to run accurately and efficiently on large numbers of processors.
+ * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed
+ * description of the parallel algorithms.
+ *
+ * The input matrices A and B are distributed by block rows.
+ * Here is a graphical illustration (0-based indexing):
+ *
+ *                        A                B
+ *               0 ---------------       ------
+ *                   |           |        |  |
+ *                   |           |   P0   |  |
+ *                   |           |        |  |
+ *                 ---------------       ------
+ *        - fst_row->|           |        |  |
+ *        |          |           |        |  |
+ *       m_loc       |           |   P1   |  |
+ *        |          |           |        |  |
+ *        -          |           |        |  |
+ *                 ---------------       ------
+ *                   |    .      |        |. |
+ *                   |    .      |        |. |
+ *                   |    .      |        |. |
+ *                 ---------------       ------
+ *
+ * where, fst_row is the row number of the first row,
+ *        m_loc is the number of rows local to this processor
+ * These are defined in the 'SuperMatrix' structure, see supermatrix.h.
+ *
+ *
+ * Here are the options for using this code:
+ *
+ *   1. Independent of all the other options specified below, the
+ *      user must supply
+ *
+ *      -  B, the matrix of right-hand sides, distributed by block rows,
+ *            and its dimensions ldb (local) and nrhs (global)
+ *      -  grid, a structure describing the 2D processor mesh
+ *      -  options->IterRefine, which determines whether or not to
+ *            improve the accuracy of the computed solution using
+ *            iterative refinement
+ *
+ *      On output, B is overwritten with the solution X.
+ *
+ *   2. Depending on options->Fact, the user has four options
+ *      for solving A*X=B. The standard option is for factoring
+ *      A "from scratch". (The other options, described below,
+ *      are used when A is sufficiently similar to a previously
+ *      solved problem to save time by reusing part or all of
+ *      the previous factorization.)
+ *
+ *      -  options->Fact = DOFACT: A is factored "from scratch"
+ *
+ *      In this case the user must also supply
+ *
+ *        o  A, the input matrix
+ *
+ *        as well as the following options to determine what matrix to
+ *        factorize.
+ *
+ *        o  options->Equil,   to specify how to scale the rows and columns
+ *                             of A to "equilibrate" it (to try to reduce its
+ *                             condition number and so improve the
+ *                             accuracy of the computed solution)
+ *
+ *        o  options->RowPerm, to specify how to permute the rows of A
+ *                             (typically to control numerical stability)
+ *
+ *        o  options->ColPerm, to specify how to permute the columns of A
+ *                             (typically to control fill-in and enhance
+ *                             parallelism during factorization)
+ *
+ *        o  options->ReplaceTinyPivot, to specify how to deal with tiny
+ *                             pivots encountered during factorization
+ *                             (to control numerical stability)
+ *
+ *      The outputs returned include
+ *
+ *        o  ScalePermstruct,  modified to describe how the input matrix A
+ *                             was equilibrated and permuted:
+ *          .  ScalePermstruct->DiagScale, indicates whether the rows and/or
+ *                                         columns of A were scaled
+ *          .  ScalePermstruct->R, array of row scale factors
+ *          .  ScalePermstruct->C, array of column scale factors
+ *          .  ScalePermstruct->perm_r, row permutation vector
+ *          .  ScalePermstruct->perm_c, column permutation vector
+ *
+ *          (part of ScalePermstruct may also need to be supplied on input,
+ *           depending on options->RowPerm and options->ColPerm as described
+ *           later).
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix diag(R)*A*diag(C)*Pc^T, where
+ *              Pc is the column permutation matrix determined by
+ *                  ScalePermstruct->perm_c
+ *              diag(R) and diag(C) are diagonal scaling matrices determined
+ *                  by ScalePermstruct->DiagScale, ScalePermstruct->R and
+ *                  ScalePermstruct->C
+ *
+ *        o  LUstruct, which contains the L and U factorization of A1 where
+ *
+ *                A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
+ *
+ *               (Note that A1 = Pc*Pr*Aout, where Aout is the matrix stored
+ *                in A on output.)
+ *
+ *   3. The second value of options->Fact assumes that a matrix with the same
+ *      sparsity pattern as A has already been factored:
+ *
+ *      -  options->Fact = SamePattern: A is factored, assuming that it has
+ *            the same nonzero pattern as a previously factored matrix. In
+ *            this case the algorithm saves time by reusing the previously
+ *            computed column permutation vector stored in
+ *            ScalePermstruct->perm_c and the "elimination tree" of A
+ *            stored in LUstruct->etree
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *        o  options->Equil
+ *        o  options->RowPerm
+ *        o  options->ReplaceTinyPivot
+ *
+ *      but not options->ColPerm, whose value is ignored. This is because the
+ *      previous column permutation from ScalePermstruct->perm_c is used as
+ *      input. The user must also supply
+ *
+ *        o  A, the input matrix
+ *        o  ScalePermstruct->perm_c, the column permutation
+ *        o  LUstruct->etree, the elimination tree
+ *
+ *      The outputs returned include
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix as described above
+ *        o  ScalePermstruct, modified to describe how the input matrix A was
+ *                            equilibrated and row permuted
+ *        o  LUstruct, modified to contain the new L and U factors
+ *
+ *   4. The third value of options->Fact assumes that a matrix B with the same
+ *      sparsity pattern as A has already been factored, and where the
+ *      row permutation of B can be reused for A. This is useful when A and B
+ *      have similar numerical values, so that the same row permutation
+ *      will make both factorizations numerically stable. This lets us reuse
+ *      all of the previously computed structure of L and U.
+ *
+ *      -  options->Fact = SamePattern_SameRowPerm: A is factored,
+ *            assuming not only the same nonzero pattern as the previously
+ *            factored matrix B, but reusing B's row permutation.
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *        o  options->Equil
+ *        o  options->ReplaceTinyPivot
+ *
+ *      but not options->RowPerm or options->ColPerm, whose values are
+ *      ignored. This is because the permutations from ScalePermstruct->perm_r
+ *      and ScalePermstruct->perm_c are used as input.
+ *
+ *      The user must also supply
+ *
+ *        o  A, the input matrix
+ *        o  ScalePermstruct->DiagScale, how the previous matrix was row
+ *                                       and/or column scaled
+ *        o  ScalePermstruct->R, the row scalings of the previous matrix,
+ *                               if any
+ *        o  ScalePermstruct->C, the column scalings of the previous matrix,
+ *                               if any
+ *        o  ScalePermstruct->perm_r, the row permutation of the previous
+ *                                    matrix
+ *        o  ScalePermstruct->perm_c, the column permutation of the previous
+ *                                    matrix
+ *        o  all of LUstruct, the previously computed information about
+ *                            L and U (the actual numerical values of L and U
+ *                            stored in LUstruct->Llu are ignored)
+ *
+ *      The outputs returned include
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix as described above
+ *        o  ScalePermstruct,  modified to describe how the input matrix A was
+ *                             equilibrated (thus ScalePermstruct->DiagScale,
+ *                             R and C may be modified)
+ *        o  LUstruct, modified to contain the new L and U factors
+ *
+ *   5. The fourth and last value of options->Fact assumes that A is
+ *      identical to a matrix that has already been factored on a previous
+ *      call, and reuses its entire LU factorization
+ *
+ *      -  options->Fact = Factored: A is identical to a previously
+ *            factorized matrix, so the entire previous factorization
+ *            can be reused.
+ *
+ *      In this case all the other options mentioned above are ignored
+ *      (options->Equil, options->RowPerm, options->ColPerm,
+ *       options->ReplaceTinyPivot)
+ *
+ *      The user must also supply
+ *
+ *        o  A, the unfactored matrix, only in the case that iterative
+ *              refinement is to be done (specifically A must be the output
+ *              A from the previous call, so that it has been scaled and permuted)
+ *        o  all of ScalePermstruct
+ *        o  all of LUstruct, including the actual numerical values of
+ *           L and U
+ *
+ *      all of which are unmodified on output.
+ *
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_dist_options_t* (global)
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *         The following fields should be defined for this structure:
+ *
+ *         o Fact (fact_t)
+ *           Specifies whether or not the factored form of the matrix
+ *           A is supplied on entry, and if not, how the matrix A should
+ *           be factorized based on the previous history.
+ *
+ *           = DOFACT: The matrix A will be factorized from scratch.
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ColPerm, ReplaceTinyPivot
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *           = SamePattern: the matrix A will be factorized assuming
+ *             that a factorization of a matrix with the same sparsity
+ *             pattern was performed prior to this one. Therefore, this
+ *             factorization will reuse column permutation vector
+ *             ScalePermstruct->perm_c and the elimination tree
+ *             LUstruct->etree
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ReplaceTinyPivot
+ *                          ScalePermstruct->perm_c
+ *                          LUstruct->etree
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          rest of ScalePermstruct (DiagScale, R, C, perm_r)
+ *                          rest of LUstruct (Glu_persist, Llu)
+ *
+ *           = SamePattern_SameRowPerm: the matrix A will be factorized
+ *             assuming that a factorization of a matrix with the same
+ *             sparsity	pattern and similar numerical values was performed
+ *             prior to this one. Therefore, this factorization will reuse
+ *             the row and column scaling factors R and C, the row and
+ *             column permutation vectors perm_r and perm_c, and the
+ *             distributed data structure set up from the previous symbolic
+ *             factorization.
+ *                 Inputs:  A
+ *                          options->Equil, ReplaceTinyPivot
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          modified LUstruct->Llu
+ *           = FACTORED: the matrix A is already factored.
+ *                 Inputs:  all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *         o Equil (yes_no_t)
+ *           Specifies whether to equilibrate the system.
+ *           = NO:  no equilibration.
+ *           = YES: scaling factors are computed to equilibrate the system:
+ *                      diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B.
+ *                  Whether or not the system will be equilibrated depends
+ *                  on the scaling of the matrix A, but if equilibration is
+ *                  used, A is overwritten by diag(R)*A*diag(C) and B by
+ *                  diag(R)*B.
+ *
+ *         o RowPerm (rowperm_t)
+ *           Specifies how to permute rows of the matrix A.
+ *           = NATURAL:   use the natural ordering.
+ *           = LargeDiag_MC64: use the Duff/Koster algorithm to permute rows of
+ *                        the original matrix to make the diagonal large
+ *                        relative to the off-diagonal.
+ *           = LargeDiag_HPWM: use the parallel approximate-weight perfect
+ *                        matching to permute rows of the original matrix
+ *                        to make the diagonal large relative to the
+ *                        off-diagonal.
+ *           = MY_PERMR:  use the ordering given in ScalePermstruct->perm_r
+ *                        input by the user.
+ *
+ *         o ColPerm (colperm_t)
+ *           Specifies what type of column permutation to use to reduce fill.
+ *           = NATURAL:       natural ordering.
+ *           = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A.
+ *           = MMD_ATA:       minimum degree ordering on structure of A'*A.
+ *           = MY_PERMC:      the ordering given in ScalePermstruct->perm_c.
+ *
+ *         o ReplaceTinyPivot (yes_no_t)
+ *           = NO:  do not modify pivots
+ *           = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during
+ *                  LU factorization.
+ *
+ *         o IterRefine (IterRefine_t)
+ *           Specifies how to perform iterative refinement.
+ *           = NO:     no iterative refinement.
+ *           = SLU_DOUBLE: accumulate residual in double precision.
+ *           = SLU_EXTRA:  accumulate residual in extra precision.
+ *
+ *         NOTE: all options must be identical on all processes when
+ *               calling this routine.
+ *
+ * A (input) SuperMatrix* (local); A resides on all 3D processes.
+ *         On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol).
+ *           The number of linear equations is A->nrow. The type of A must be:
+ *           Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE.
+ *           That is, A is stored in distributed compressed row format.
+ *           See supermatrix.h for the definition of 'SuperMatrix'.
+ *           This routine only handles square A, however, the LU factorization
+ *           routine PDGSTRF can factorize rectangular matrices.
+ *
+ *         Internally, A is gathered on 2D process grid-0, call it A2d.
+ *         On exit, A2d may be overwritten by diag(R)*A*diag(C)*Pc^T,
+ *           depending on ScalePermstruct->DiagScale and options->ColPerm:
+ *             if ScalePermstruct->DiagScale != NOEQUIL, A2d is overwritten by
+ *                diag(R)*A*diag(C).
+ *             if options->ColPerm != NATURAL, A2d is further overwritten by
+ *                diag(R)*A*diag(C)*Pc^T.
+ *           If all the above conditions are true, the LU decomposition is
+ *           performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T.
+ *
+ * ScalePermstruct (input/output) dScalePermstruct_t* (global)
+ *         The data structure to store the scaling and permutation vectors
+ *         describing the transformations performed to the matrix A.
+ *         It contains the following fields:
+ *
+ *         o DiagScale (DiagScale_t)
+ *           Specifies the form of equilibration that was done.
+ *           = NOEQUIL: no equilibration.
+ *           = ROW:     row equilibration, i.e., A was premultiplied by
+ *                      diag(R).
+ *           = COL:     Column equilibration, i.e., A was postmultiplied
+ *                      by diag(C).
+ *           = BOTH:    both row and column equilibration, i.e., A was
+ *                      replaced by diag(R)*A*diag(C).
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm,
+ *           DiagScale is an input argument; otherwise it is an output
+ *           argument.
+ *
+ *         o perm_r (int*)
+ *           Row permutation vector, which defines the permutation matrix Pr;
+ *           perm_r[i] = j means row i of A is in position j in Pr*A.
+ *           If options->RowPerm = MY_PERMR, or
+ *           options->Fact = SamePattern_SameRowPerm, perm_r is an
+ *           input argument; otherwise it is an output argument.
+ *
+ *         o perm_c (int*)
+ *           Column permutation vector, which defines the
+ *           permutation matrix Pc; perm_c[i] = j means column i of A is
+ *           in position j in A*Pc.
+ *           If options->ColPerm = MY_PERMC or options->Fact = SamePattern
+ *           or options->Fact = SamePattern_SameRowPerm, perm_c is an
+ *           input argument; otherwise, it is an output argument.
+ *           On exit, perm_c may be overwritten by the product of the input
+ *           perm_c and a permutation that postorders the elimination tree
+ *           of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree
+ *           is already in postorder.
+ *
+ *         o R (double *) dimension (A->nrow)
+ *           The row scale factors for A.
+ *           If DiagScale = ROW or BOTH, A is multiplied on the left by
+ *                          diag(R).
+ *           If DiagScale = NOEQUIL or COL, R is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, R is
+ *           an input argument; otherwise, R is an output argument.
+ *
+ *         o C (double *) dimension (A->ncol)
+ *           The column scale factors for A.
+ *           If DiagScale = COL or BOTH, A is multiplied on the right by
+ *                          diag(C).
+ *           If DiagScale = NOEQUIL or ROW, C is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, C is
+ *           an input argument; otherwise, C is an output argument.
+ *
+ * B       (input/output) double* (local)
+ *         On entry, the right-hand side matrix of dimension (m_loc, nrhs),
+ *           where, m_loc is the number of rows stored locally on my
+ *           process and is defined in the data structure of matrix A.
+ *         On exit, the solution matrix if info = 0;
+ *
+ * ldb     (input) int (local)
+ *         The leading dimension of matrix B.
+ *
+ * nrhs    (input) int (global)
+ *         The number of right-hand sides.
+ *         If nrhs = 0, only LU decomposition is performed, the forward
+ *         and back substitutions are skipped.
+ *
+ * grid    (input) gridinfo_t* (global)
+ *         The 2D process mesh. It contains the MPI communicator, the number
+ *         of process rows (NPROW), the number of process columns (NPCOL),
+ *         and my process rank. It is an input argument to all the
+ *         parallel routines.
+ *         Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *         See superlu_ddefs.h for the definition of 'gridinfo_t'.
+ *
+ * LUstruct (input/output) dLUstruct_t*
+ *         The data structures to store the distributed L and U factors.
+ *         It contains the following fields:
+ *
+ *         o etree (int*) dimension (A->ncol) (global)
+ *           Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'.
+ *           It is computed in sp_colorder() during the first factorization,
+ *           and is reused in the subsequent factorizations of the matrices
+ *           with the same nonzero pattern.
+ *           On exit of sp_colorder(), the columns of A are permuted so that
+ *           the etree is in a certain postorder. This postorder is reflected
+ *           in ScalePermstruct->perm_c.
+ *           NOTE:
+ *           Etree is a vector of parent pointers for a forest whose vertices
+ *           are the integers 0 to A->ncol-1; etree[root]==A->ncol.
+ *
+ *         o Glu_persist (Glu_persist_t*) (global)
+ *           Global data structure (xsup, supno) replicated on all processes,
+ *           describing the supernode partition in the factored matrices
+ *           L and U:
+ *	       xsup[s] is the leading column of the s-th supernode,
+ *             supno[i] is the supernode number to which column i belongs.
+ *
+ *         o Llu (dLocalLU_t*) (local)
+ *           The distributed data structures to store L and U factors.
+ *           See superlu_ddefs.h for the definition of 'dLocalLU_t'.
+ *
+ * SOLVEstruct (input/output) dSOLVEstruct_t*
+ *         The data structure to hold the communication pattern used
+ *         in the phases of triangular solution and iterative refinement.
+ *         This pattern should be initialized only once for repeated solutions.
+ *         If options->SolveInitialized = YES, it is an input argument.
+ *         If options->SolveInitialized = NO and nrhs != 0, it is an output
+ *         argument. See superlu_ddefs.h for the definition of 'dSOLVEstruct_t'.
+ *
+ * berr    (output) double*, dimension (nrhs) (global)
+ *         The componentwise relative backward error of each solution
+ *         vector X(j) (i.e., the smallest relative change in
+ *         any element of A or B that makes X(j) an exact solution).
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util_dist.h for the definition of 'SuperLUStat_t'.
+ *
+ * info    (output) int*
+ *         = 0: successful exit
+ *         < 0: if info = -i, the i-th argument had an illegal value
+ *         > 0: if info = i, and i is
+ *             <= A->ncol: U(i,i) is exactly zero. The factorization has
+ *                been completed, but the factor U is exactly singular,
+ *                so the solution could not be computed.
+ *             > A->ncol: number of bytes allocated when memory allocation
+ *                failure occurred, plus A->ncol.
+ *
+ * See superlu_ddefs.h for the definitions of various data types.
+ * 
+ */ + +int writeLUtoDisk(int nsupers, int_t *xsup, dLUstruct_t *LUstruct) +{ + + if (getenv("LUFILE")) + { + FILE *fp = fopen(getenv("LUFILE"), "w"); + printf("writing to %s", getenv("LUFILE")); + for (int i = 0; i < nsupers; i++) + { + if (LUstruct->Llu->Lrowind_bc_ptr[i]) + { + int_t *lsub = LUstruct->Llu->Lrowind_bc_ptr[i]; + double *nzval = LUstruct->Llu->Lnzval_bc_ptr[i]; + + int_t len = lsub[1]; /* LDA of the nzval[] */ + int_t len2 = SuperSize(i) * len; + fwrite(nzval, sizeof(double), len2, fp); // assume fp will be incremented + } + + if (LUstruct->Llu->Ufstnz_br_ptr[i]) + { + int_t *usub = LUstruct->Llu->Ufstnz_br_ptr[i]; + double *nzval = LUstruct->Llu->Unzval_br_ptr[i]; + int_t lenv = usub[1]; + + fwrite(nzval, sizeof(double), lenv, fp); // assume fp will be incremented + } + } + + fclose(fp); + } + else + { + printf("Please set environment variable LUFILE to write\n..bye bye"); + exit(0); + } + + return 0; +} + +#define EPSILON 1e-3 + +static int checkArr(double *A, double *B, int n) +{ + for (int i = 0; i < n; i++) + { + assert(fabs(A[i] - B[i]) <= EPSILON * SUPERLU_MIN(fabs(A[i]), fabs(B[i]))); + } + + return 0; +} + +int checkLUFromDisk(int nsupers, int_t *xsup, dLUstruct_t *LUstruct) +{ + dLocalLU_t *Llu = LUstruct->Llu; + + double *Lval_buf = doubleMalloc_dist(Llu->bufmax[1]); // DOUBLE_ALLOC(Llu->bufmax[1]); + double *Uval_buf = doubleMalloc_dist(Llu->bufmax[3]); // DOUBLE_ALLOC(Llu->bufmax[3]); + + if (getenv("LUFILE")) + { + FILE *fp = fopen(getenv("LUFILE"), "r"); + printf("reading from %s", getenv("LUFILE")); + for (int i = 0; i < nsupers; i++) + { + if (LUstruct->Llu->Lrowind_bc_ptr[i]) + { + int_t *lsub = LUstruct->Llu->Lrowind_bc_ptr[i]; + double *nzval = LUstruct->Llu->Lnzval_bc_ptr[i]; + + int_t len = lsub[1]; /* LDA of the nzval[] */ + int_t len2 = SuperSize(i) * len; + fread(Lval_buf, sizeof(double), len2, fp); // assume fp will be incremented + checkArr(nzval, Lval_buf, len2); + } + + if (LUstruct->Llu->Ufstnz_br_ptr[i]) + { + int_t 
*usub = LUstruct->Llu->Ufstnz_br_ptr[i]; + double *nzval = LUstruct->Llu->Unzval_br_ptr[i]; + int_t lenv = usub[1]; + + fread(Uval_buf, sizeof(double), lenv, fp); // assume fp will be incremented + checkArr(nzval, Uval_buf, lenv); + } + } + printf("CHecking LU from %s is succesful ", getenv("LUFILE")); + fclose(fp); + } + else + { + printf("Please set environment variable LUFILE to read\n..bye bye"); + exit(0); + } + + return 0; +} + + +/*! \brief Dump the factored matrix L using matlab triple-let format + */ +void dDumpLblocks3D(int_t nsupers, gridinfo3d_t *grid3d, + Glu_persist_t *Glu_persist, dLocalLU_t *Llu) +{ + register int c, extra, gb, j, i, lb, nsupc, nsupr, len, nb, ncb; + int k, mycol, r, n, nmax; + int_t nnzL; + int_t *xsup = Glu_persist->xsup; + int_t *index; + double *nzval; + char filename[256]; + FILE *fp, *fopen(); + gridinfo_t *grid = &(grid3d->grid2d); + int iam = grid->iam; + int iam3d = grid3d->iam; + + // assert(grid->npcol*grid->nprow==1); + + // count nonzeros in the first pass + nnzL = 0; + n = 0; + ncb = nsupers / grid->npcol; + extra = nsupers % grid->npcol; + mycol = MYCOL( iam, grid ); + if ( mycol < extra ) ++ncb; + for (lb = 0; lb < ncb; ++lb) { + index = Llu->Lrowind_bc_ptr[lb]; + if ( index ) { /* Not an empty column */ + nzval = Llu->Lnzval_bc_ptr[lb]; + nb = index[0]; + nsupr = index[1]; + gb = lb * grid->npcol + mycol; + nsupc = SuperSize( gb ); + for (c = 0, k = BC_HEADER, r = 0; c < nb; ++c) { + len = index[k+1]; + + for (j = 0; j < nsupc; ++j) { + for (i=0; i=xsup[gb]+j+1){ + nnzL ++; + nmax = SUPERLU_MAX(n,index[k+LB_DESCRIPTOR+i]+1); + n = nmax; + } + + } + } + k += LB_DESCRIPTOR + len; + r += len; + } + } + } + MPI_Allreduce(MPI_IN_PLACE,&nnzL,1,mpi_int_t,MPI_SUM,grid->comm); + MPI_Allreduce(MPI_IN_PLACE,&n,1,mpi_int_t,MPI_MAX,grid->comm); + + snprintf(filename, sizeof(filename), "%s-%d", "L", iam3d); + printf("Dumping L factor to --> %s\n", filename); + if ( !(fp = fopen(filename, "w")) ) { + ABORT("File open failed"); + } 
+ + if(grid->iam==0){ + fprintf(fp, "%d %d " IFMT "\n", n,n,nnzL); + } + + ncb = nsupers / grid->npcol; + extra = nsupers % grid->npcol; + mycol = MYCOL( iam, grid ); + if ( mycol < extra ) ++ncb; + for (lb = 0; lb < ncb; ++lb) { + index = Llu->Lrowind_bc_ptr[lb]; + if ( index ) { /* Not an empty column */ + nzval = Llu->Lnzval_bc_ptr[lb]; + nb = index[0]; + nsupr = index[1]; + gb = lb * grid->npcol + mycol; + nsupc = SuperSize( gb ); + for (c = 0, k = BC_HEADER, r = 0; c < nb; ++c) { + len = index[k+1]; + + for (j = 0; j < nsupc; ++j) { + for (i=0; iStore; + SuperMatrix GA; /* Global A in NC format */ + NCformat *GAstore; + double *a_GA; + SuperMatrix GAC; /* Global A in NCP format (add n end pointers) */ + NCPformat *GACstore; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + Glu_freeable_t *Glu_freeable = NULL; + /* The nonzero structures of L and U factors, which are + replicated on all processrs. + (lsub, xlsub) contains the compressed subscript of + supernodes in L. + (usub, xusub) contains the compressed subscript of + nonzero segments in U. + If options->Fact != SamePattern_SameRowPerm, they are + computed by SYMBFACT routine, and then used by PDDISTRIBUTE + routine. They will be freed after PDDISTRIBUTE routine. + If options->Fact == SamePattern_SameRowPerm, these + structures are not used. */ + yes_no_t parSymbFact = options->ParSymbFact; + fact_t Fact; + double *a; + int_t *colptr, *rowind; + int_t *perm_r; /* row permutations from partial pivoting */ + int_t *perm_c; /* column permutation vector */ + int_t *etree; /* elimination tree */ + int_t *rowptr, *colind; /* Local A in NR */ + int_t colequ, Equil, factored, job, notran, rowequ, need_value; + int_t i, iinfo, j, irow, m, n, nnz, permc_spec; + int_t nnz_loc, m_loc, fst_row, icol; + int iam; + int ldx; /* LDA for matrix X (local). 
*/ + char equed[1], norm[1]; + double *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd; + double *X, *b_col, *b_work, *x_col; + double t; + float GA_mem_use; /* memory usage by global A */ + float dist_mem_use; /* memory usage during distribution */ + superlu_dist_mem_usage_t num_mem_usage, symb_mem_usage; + float flinfo; /* track memory usage of parallel symbolic factorization */ + bool Solve3D = true; + int_t nsupers; +#if (PRNTlevel >= 2) + double dmin, dsum, dprod; +#endif + + dtrf3Dpartition_t *trf3Dpartition=LUstruct->trf3Dpart; + int gpu3dVersion = 0; + #ifdef GPU_ACC + // gpu3dVersion = 1; + if (getenv("GPU3DVERSION")) + { + gpu3dVersion = atoi(getenv("GPU3DVERSION")); + } + + LUgpu_Handle LUgpu; + #endif + + + LUstruct->dt = 'd'; + + // get the 2d grid + gridinfo_t *grid = &(grid3d->grid2d); + iam = grid->iam; + + /* Test the options choices. */ + *info = 0; + Fact = options->Fact; + validateInput_ssvx3d(options, A, ldb, nrhs, grid3d, info); + + /* Initialization. */ + + options->Algo3d = YES; + + /* definition of factored seen by each process layer */ + factored = (Fact == FACTORED); + + /* Save the inputs: ldb -> ldb3d, and B -> B3d, Astore -> Astore3d, + so that the names {ldb, B, and Astore} can be used internally. + B3d and Astore3d will be assigned back to B and Astore on return.*/ + int ldb3d = ldb; + NRformat_loc *Astore3d = (NRformat_loc *)A->Store; + NRformat_loc3d *A3d = SOLVEstruct->A3d; + + /* B3d is aliased to B; + B2d is allocated; + B is then aliased to B2d for the following 2D solve; + */ + dGatherNRformat_loc3d_allgrid(Fact, (NRformat_loc *)A->Store, + B, ldb, nrhs, grid3d, &A3d); + + B = (double *)A3d->B2d; /* B is now pointing to B2d, + allocated in dGatherNRformat_loc3d. 
*/ + // PrintDouble5("after gather B=B2d", ldb, B); + + SOLVEstruct->A3d = A3d; /* This structure need to be persistent across + multiple calls of pdgssvx3d() */ + + NRformat_loc *Astore0 = A3d->A_nfmt; // on all grids + NRformat_loc *A_orig = A->Store; + ////// + +#if (DEBUGlevel >= 1) + CHECK_MALLOC(iam, "Enter pdgssvx3d()"); +#endif + + /* Perform preprocessing steps on process layer zero, including: + gather 3D matrices {A, B} onto 2D grid-0, preprocessing steps: + - equilibration, + - ordering, + - symbolic factorization, + - distribution of L & U */ + + m = A->nrow; + n = A->ncol; + // checkNRFMT(Astore0, (NRformat_loc *) A->Store); + + // On input, A->Store is on 3D, now A->Store is re-assigned to 2D store + A->Store = Astore0; // on all grids + ldb = Astore0->m_loc; + + /* The following code now works on all grids */ + Astore = (NRformat_loc *)A->Store; + nnz_loc = Astore->nnz_loc; + m_loc = Astore->m_loc; + fst_row = Astore->fst_row; + a = (double *)Astore->nzval; + rowptr = Astore->rowptr; + colind = Astore->colind; + + /* Structures needed for parallel symbolic factorization */ + int_t *sizes, *fstVtxSep; + int noDomains, nprocs_num; + MPI_Comm symb_comm; /* communicator for symbolic factorization */ + int col, key; /* parameters for creating a new communicator */ + Pslu_freeable_t Pslu_freeable; + + sizes = NULL; + fstVtxSep = NULL; + symb_comm = MPI_COMM_NULL; + + Equil = (!factored && options->Equil == YES); + notran = (options->Trans == NOTRANS); + + iam = grid->iam; + job = 5; + /* Extract equilibration status from a previous factorization */ + if (factored || (Fact == SamePattern_SameRowPerm && Equil)) + { + rowequ = (ScalePermstruct->DiagScale == ROW) || + (ScalePermstruct->DiagScale == BOTH); + colequ = (ScalePermstruct->DiagScale == COL) || + (ScalePermstruct->DiagScale == BOTH); + } + else + { + rowequ = colequ = FALSE; + } + + /* Not factored & ask for equilibration, then alloc RC */ + if (Equil && Fact != SamePattern_SameRowPerm) + 
dallocScalePermstruct_RC(ScalePermstruct, m, n); + + /* The following arrays are replicated on all processes. */ + perm_r = ScalePermstruct->perm_r; + perm_c = ScalePermstruct->perm_c; + etree = LUstruct->etree; + R = ScalePermstruct->R; + C = ScalePermstruct->C; + + /* ------------------------------------------------------------ + Diagonal scaling to equilibrate the matrix. + ------------------------------------------------------------ */ + if (Equil) + { + scaleMatrixDiagonally(Fact, ScalePermstruct, + A, stat, grid, &rowequ, &colequ, &iinfo); + if (iinfo < 0) + return; // return if error + + } /* end if Equil ... LAPACK style, not involving MC64 */ + + if (!factored) + { /* Skip this if already factored. */ + /* + * Gather A from the distributed compressed row format to + * global A in compressed column format. + * Numerical values are gathered only when a row permutation + * for large diagonal is sought after. + */ + if (Fact != SamePattern_SameRowPerm && + (parSymbFact == NO || options->RowPerm != NO)) + { + + need_value = (options->RowPerm == LargeDiag_MC64); + + pdCompRow_loc_to_CompCol_global(need_value, A, grid, &GA); + + GAstore = (NCformat *)GA.Store; + colptr = GAstore->colptr; + rowind = GAstore->rowind; + nnz = GAstore->nnz; + GA_mem_use = (nnz + n + 1) * sizeof(int_t); + + if (need_value) + { + a_GA = (double *)GAstore->nzval; + GA_mem_use += nnz * sizeof(double); + } + + else + assert(GAstore->nzval == NULL); + } + + /* ------------------------------------------------------------ + Find the row permutation for A. + ------------------------------------------------------------ */ + perform_row_permutation( + options, Fact, ScalePermstruct, LUstruct, + m, n, grid, A, &GA, stat, job, Equil, + &rowequ, &colequ, &iinfo); + + } /* end if (!factored) */ + + /* Compute norm(A), which will be used to adjust small diagonal. 
*/ + if (!factored || options->IterRefine) + anorm = computeA_Norm(notran, A, grid); + + /* ------------------------------------------------------------ + Perform ordering and symbolic factorization + ------------------------------------------------------------ */ + if (!factored) + { + t = SuperLU_timer_(); + /* + * Get column permutation vector perm_c[], according to permc_spec: + * permc_spec = NATURAL: natural ordering + * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A + * permc_spec = MMD_ATA: minimum degree on structure of A'*A + * permc_spec = METIS_AT_PLUS_A: METIS on structure of A'+A + * permc_spec = PARMETIS: parallel METIS on structure of A'+A + * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] + */ + permc_spec = options->ColPerm; + + if (parSymbFact == YES || permc_spec == PARMETIS) + { + nprocs_num = grid->nprow * grid->npcol; + noDomains = (int)(pow(2, ((int)LOG2(nprocs_num)))); + + /* create a new communicator for the first noDomains + processes in grid->comm */ + key = iam; + if (iam < noDomains) + col = 0; + else + col = MPI_UNDEFINED; + MPI_Comm_split(grid->comm, col, key, &symb_comm); + + if (permc_spec == NATURAL || permc_spec == MY_PERMC) + { + if (permc_spec == NATURAL) + { + for (j = 0; j < n; ++j) + perm_c[j] = j; + } + if (!(sizes = intMalloc_dist(2 * noDomains))) + ABORT("SUPERLU_MALLOC fails for sizes."); + if (!(fstVtxSep = intMalloc_dist(2 * noDomains))) + ABORT("SUPERLU_MALLOC fails for fstVtxSep."); + for (i = 0; i < 2 * noDomains - 2; ++i) + { + sizes[i] = 0; + fstVtxSep[i] = 0; + } + sizes[2 * noDomains - 2] = m; + fstVtxSep[2 * noDomains - 2] = 0; + } + else if (permc_spec != PARMETIS) + { + /* same as before */ + printf("{%4d,%4d}: pdgssvx3d: invalid ColPerm option when ParSymbfact is used\n", + (int)MYROW(grid->iam, grid), (int)MYCOL(grid->iam, grid)); + } + } /* end ... 
use parmetis */ + + + if (permc_spec != MY_PERMC && Fact == DOFACT) + { + if (permc_spec == PARMETIS) + { + /* Get column permutation vector in perm_c. * + * This routine takes as input the distributed input matrix A * + * and does not modify it. It also allocates memory for * + * sizes[] and fstVtxSep[] arrays, that contain information * + * on the separator tree computed by ParMETIS. */ + flinfo = get_perm_c_parmetis(A, perm_r, perm_c, nprocs_num, + noDomains, &sizes, &fstVtxSep, + grid, &symb_comm); + if (flinfo > 0) + ABORT("ERROR in get perm_c parmetis."); + } + else + { + get_perm_c_dist(iam, permc_spec, &GA, perm_c); + } + } + + stat->utime[COLPERM] = SuperLU_timer_() - t; + + /* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc' + (a.k.a. column etree), depending on the choice of ColPerm. + Adjust perm_c[] to be consistent with a postorder of etree. + Permute columns of A to form A*Pc'. */ + if (Fact != SamePattern_SameRowPerm) + { + if (parSymbFact == NO) + { + + int_t *GACcolbeg, *GACcolend, *GACrowind; + + sp_colorder(options, &GA, perm_c, etree, &GAC); + + /* Form Pc*A*Pc' to preserve the diagonal of the matrix GAC. */ + GACstore = (NCPformat *)GAC.Store; + GACcolbeg = GACstore->colbeg; + GACcolend = GACstore->colend; + GACrowind = GACstore->rowind; + for (j = 0; j < n; ++j) + { + for (i = GACcolbeg[j]; i < GACcolend[j]; ++i) + { + irow = GACrowind[i]; + GACrowind[i] = perm_c[irow]; + } + } + + /* Perform a symbolic factorization on Pc*Pr*A*Pc' and set up + the nonzero data structures for L & U. */ +#if (PRNTlevel >= 1) + if (!iam) + printf(".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n", + sp_ienv_dist(2, options), sp_ienv_dist(3, options), sp_ienv_dist(6, options)); +#endif + t = SuperLU_timer_(); + if (!(Glu_freeable = (Glu_freeable_t *) + SUPERLU_MALLOC(sizeof(Glu_freeable_t)))) + ABORT("Malloc fails for Glu_freeable."); + + /* Every process does this. 
*/ + iinfo = symbfact(options, iam, &GAC, perm_c, etree, + Glu_persist, Glu_freeable); + + stat->utime[SYMBFAC] = SuperLU_timer_() - t; + if (iinfo < 0) + { + /* Successful return */ + QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage); + +#if (PRNTlevel >= 1) + if (!iam) + { + printf("\tNo of supers %ld\n", + (long)Glu_persist->supno[n - 1] + 1); + printf("\tSize of G(L) %ld\n", (long)Glu_freeable->xlsub[n]); + printf("\tSize of G(U) %ld\n", (long)Glu_freeable->xusub[n]); + printf("\tint %lu, short %lu, float %lu, double %lu\n", + sizeof(int_t), sizeof(short), + sizeof(float), sizeof(double)); + printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", + symb_mem_usage.for_lu * 1e-6, + symb_mem_usage.total * 1e-6, + symb_mem_usage.expansions); + } +#endif + } + else + { + if (!iam) + { + fprintf(stderr, "symbfact() error returns %d\n", + (int)iinfo); + exit(-1); + } + } + + } /* end serial symbolic factorization */ + else + { /* parallel symbolic factorization */ + t = SuperLU_timer_(); + flinfo = + symbfact_dist(options, nprocs_num, noDomains, + A, perm_c, perm_r, + sizes, fstVtxSep, &Pslu_freeable, + &(grid->comm), &symb_comm, + &symb_mem_usage); + stat->utime[SYMBFAC] = SuperLU_timer_() - t; + if (flinfo > 0) + ABORT("Insufficient memory for parallel symbolic factorization."); + } + + /* Destroy GA */ + if (parSymbFact == NO || options->RowPerm != NO) + Destroy_CompCol_Matrix_dist(&GA); + if (parSymbFact == NO) + Destroy_CompCol_Permuted_dist(&GAC); + + } /* end if Fact not SamePattern_SameRowPerm */ + +#if (DEBUGlevel >= 2) // Sherry + if (!iam) + PrintInt10("perm_c", m, perm_c); +#endif + if (sizes) + SUPERLU_FREE(sizes); + if (fstVtxSep) + SUPERLU_FREE(fstVtxSep); + if (symb_comm != MPI_COMM_NULL) + MPI_Comm_free(&symb_comm); + + if (parSymbFact == NO || Fact == SamePattern_SameRowPerm) + { + /* Apply column permutation to the original distributed A */ + for (j = 0; j < nnz_loc; ++j) + colind[j] = perm_c[colind[j]]; + + /* Distribute 
Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. + NOTE: the row permutation Pc*Pr is applied internally in the + distribution routine. */ + t = SuperLU_timer_(); + + nsupers = getNsupers(n, LUstruct->Glu_persist); + + if(Fact != SamePattern_SameRowPerm){ + LUstruct->trf3Dpart = SUPERLU_MALLOC(sizeof(dtrf3Dpartition_t)); + newTrfPartitionInit(nsupers, LUstruct, grid3d); + trf3Dpartition=LUstruct->trf3Dpart; + } + + dist_mem_use = pddistribute3d_Yang(options, n, A, ScalePermstruct, + Glu_freeable, LUstruct, grid3d); + + if(Fact != SamePattern_SameRowPerm){ + /* now that LU structure has been scattered, initialize the LU and buffers */ + dinit3DLUstructForest(trf3Dpartition->myTreeIdxs, trf3Dpartition->myZeroTrIdxs, + trf3Dpartition->sForests, LUstruct, grid3d); + dLUValSubBuf_t *LUvsb = SUPERLU_MALLOC(sizeof(dLUValSubBuf_t)); + dLluBufInit(LUvsb, LUstruct); + trf3Dpartition->LUvsb = LUvsb; + trf3Dpartition->iperm_c_supno = create_iperm_c_supno(nsupers, options, LUstruct, grid3d); + } + + + stat->utime[DIST] = SuperLU_timer_() - t; + + /* Deallocate storage used in symbolic factorization. */ + if (Fact != SamePattern_SameRowPerm) + { + iinfo = symbfact_SubFree(Glu_freeable); + SUPERLU_FREE(Glu_freeable); + } + + } + else + { + /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. + NOTE: the row permutation Pc*Pr is applied internally in the + distribution routine. 
*/ + /* Apply column permutation to the original distributed A */ + for (j = 0; j < nnz_loc; ++j) + colind[j] = perm_c[colind[j]]; + + t = SuperLU_timer_(); + dist_mem_use = ddist_psymbtonum(options, n, A, ScalePermstruct, + &Pslu_freeable, LUstruct, grid); + if (dist_mem_use > 0) + ABORT("Not enough memory available for dist_psymbtonum\n"); + + stat->utime[DIST] = SuperLU_timer_() - t; + + ABORT("ddist_psymbtonum does not yet work with 3D factorization\n"); + + } + + /*if (!iam) printf ("\tDISTRIBUTE time %8.2f\n", stat->utime[DIST]); */ + + + /* Perform numerical factorization in parallel on all process layers.*/ + + /* nvshmem related. The nvshmem_malloc has to be called before trs_compute_communication_structure, otherwise solve is much slower*/ + #ifdef HAVE_NVSHMEM + int nc = CEILING( nsupers, grid->npcol); + int nr = CEILING( nsupers, grid->nprow); + int flag_bc_size = RDMA_FLAG_SIZE * (nc+1); + int flag_rd_size = RDMA_FLAG_SIZE * nr * 2; + int my_flag_bc_size = RDMA_FLAG_SIZE * (nc+1); + int my_flag_rd_size = RDMA_FLAG_SIZE * nr * 2; + int maxrecvsz = sp_ienv_dist(3, options)* nrhs + SUPERLU_MAX( XK_H, LSUM_H ); + int ready_x_size = maxrecvsz*nc; + int ready_lsum_size = 2*maxrecvsz*nr; + if (getenv("SUPERLU_ACC_SOLVE")){ + nv_init_wrapper(grid->comm); + prepare_multiGPU_buffers(flag_bc_size,flag_rd_size,ready_x_size,ready_lsum_size,my_flag_bc_size,my_flag_rd_size); + } + #endif + + + + + + SCT_t *SCT = (SCT_t *)SUPERLU_MALLOC(sizeof(SCT_t)); + SCT_init(SCT); + +#if (PRNTlevel >= 1) + if (grid3d->iam == 0) + { + printf("after 3D initialization.\n"); + fflush(stdout); + } +#endif + + + + + + t = SuperLU_timer_(); + + /*factorize in grid 1*/ + // if(grid3d->zscp.Iam) + // get environment variable TRF3DVERSION +#ifdef GPU_ACC + if (gpu3dVersion == 1) + { /* this is the new C++ code in TRF3dV100/ directory */ + + if (!grid3d->iam) + printf("Using pdgstrf3d+gpu version 1 for Summit\n"); +#if 0 + pdgstrf3d_summit(options, m, n, anorm, trf3Dpartition, SCT, 
LUstruct, + grid3d, stat, info); +#else + int_t ldt = sp_ienv_dist(3, options); /* Size of maximum supernode */ + double s_eps = smach_dist("Epsilon"); + double thresh = s_eps * anorm; + + /* call constructor in C++ code */ + LUgpu = createLUgpuHandle(nsupers, ldt, trf3Dpartition, LUstruct, grid3d, + SCT, options, stat, thresh, info); + + /* call pdgstrf3d() in C++ code */ + pdgstrf3d_LUpackedInterface(LUgpu); + + copyLUGPU2Host(LUgpu, LUstruct); + destroyLUgpuHandle(LUgpu); + + // print other stuff + // if (!grid3d->zscp.Iam) + // SCT_printSummary(grid, SCT); + reduceStat(FACT, stat, grid3d); + +#endif + } + else /* this is the old C code, with less GPU offload */ +#endif /* matching ifdef GPU_ACC */ + { + + pdgstrf3d(options, m, n, anorm, trf3Dpartition, SCT, LUstruct, + grid3d, stat, info); + + // dDumpLblocks3D(nsupers, grid3d, LUstruct->Glu_persist, LUstruct->Llu); + + + } + if (getenv("NEW3DSOLVE")){ + dbroadcastAncestor3d(trf3Dpartition, LUstruct, grid3d, SCT); + } + + if ( options->Fact != SamePattern_SameRowPerm) { + if (getenv("NEW3DSOLVE") && Solve3D==true){ + trs_compute_communication_structure(options, n, LUstruct, + ScalePermstruct, trf3Dpartition->supernodeMask, grid, stat); + }else{ + int* supernodeMask = int32Malloc_dist(nsupers); + for(int ii=0; iiutime[FACT] = SuperLU_timer_() - t; + + /*factorize in grid 1*/ + // if(grid3d->zscp.Iam) + double tgather = SuperLU_timer_(); + if(Solve3D==false){ + dgatherAllFactoredLU(trf3Dpartition, LUstruct, grid3d, SCT); + } + SCT->gatherLUtimer += SuperLU_timer_() - tgather; + /*print stats for bottom grid*/ + + // Write LU to file + int writeLU = 0; + if (getenv("WRITELU")) + { + writeLU = atoi(getenv("WRITELU")); + } + + if (writeLU) + { + if (!grid3d->zscp.Iam) + writeLUtoDisk(nsupers, LUstruct->Glu_persist->xsup, LUstruct); + } + + int checkLU = 0; + if (getenv("CHECKLU")) + { + checkLU = atoi(getenv("CHECKLU")); + } + + if (checkLU) + { + if (!grid3d->zscp.Iam) + checkLUFromDisk(nsupers, 
LUstruct->Glu_persist->xsup, LUstruct); + } + +#if (PRNTlevel >= 0) + if (!grid3d->zscp.Iam) + { + SCT_print(grid, SCT); + SCT_print3D(grid3d, SCT); + } + SCT_printComm3D(grid3d, SCT); + + /*print memory usage*/ + d3D_printMemUse(trf3Dpartition, LUstruct, grid3d); + + SCT->gatherLUtimer += SuperLU_timer_() - tgather; + /*print stats for bottom grid*/ + /*print forest weight and costs*/ + printForestWeightCost(trf3Dpartition->sForests, SCT, grid3d); + /*reduces stat from all the layers*/ +#endif + + SCT_free(SCT); + + } /* end if not Factored ... factor on all process layers */ + + if (grid3d->zscp.Iam == 0 ) + { // only process layer 0 + if (!factored) + { + if (options->PrintStat) + { + int_t TinyPivots; + float for_lu, total, avg, loc_max; + float mem_stage[3]; + struct { float val; int rank; } local_struct, global_struct; + + MPI_Reduce( &stat->TinyPivots, &TinyPivots, 1, mpi_int_t, + MPI_SUM, 0, grid->comm ); + stat->TinyPivots = TinyPivots; + + /*-- Compute high watermark of all stages --*/ + if (parSymbFact == TRUE) + { + /* The memory used in the redistribution routine + includes the memory used for storing the symbolic + structure and the memory allocated for numerical + factorization */ + mem_stage[0] = (-flinfo); /* symbfact step */ + mem_stage[1] = (-dist_mem_use); /* distribution step */ + loc_max = SUPERLU_MAX( mem_stage[0], mem_stage[1]); + if (options->RowPerm != NO ) + loc_max = SUPERLU_MAX(loc_max, GA_mem_use); + } + else + { + mem_stage[0] = symb_mem_usage.total + GA_mem_use; /* symbfact step */ + mem_stage[1] = symb_mem_usage.for_lu + dist_mem_use + num_mem_usage.for_lu; /* distribution step */ + loc_max = SUPERLU_MAX(mem_stage[0], mem_stage[1] ); + } + + dQuerySpace_dist(n, LUstruct, grid, stat, &num_mem_usage); + mem_stage[2] = num_mem_usage.total; /* numerical factorization step */ + + loc_max = SUPERLU_MAX(loc_max, mem_stage[2] ); /* local max of 3 stages */ + + local_struct.val = loc_max; + local_struct.rank = grid->iam; + MPI_Reduce( 
&local_struct, &global_struct, 1, MPI_FLOAT_INT, MPI_MAXLOC, 0, grid->comm ); + int all_highmark_rank = global_struct.rank; + float all_highmark_mem = global_struct.val * 1e-6; + + MPI_Reduce( &loc_max, &avg, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); + MPI_Reduce( &num_mem_usage.for_lu, &for_lu, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); + MPI_Reduce( &num_mem_usage.total, &total, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); + + /*-- Compute memory usage of numerical factorization --*/ + local_struct.val = num_mem_usage.for_lu; + MPI_Reduce(&local_struct, &global_struct, 1, MPI_FLOAT_INT, MPI_MAXLOC, 0, grid->comm); + int lu_max_rank = global_struct.rank; + float lu_max_mem = global_struct.val * 1e-6; + + local_struct.val = stat->peak_buffer; + MPI_Reduce( &local_struct, &global_struct, 1, MPI_FLOAT_INT, MPI_MAXLOC, 0, grid->comm ); + int buffer_peak_rank = global_struct.rank; + float buffer_peak = global_struct.val*1e-6; + if (iam == 0) + { + printf("\n** Memory Usage **********************************\n"); + printf("** Total highmark (MB):\n" + " Sum-of-all : %8.2f | Avg : %8.2f | Max : %8.2f\n", + avg * 1e-6, + avg / grid->nprow / grid->npcol * 1e-6, + all_highmark_mem); + printf(" Max at rank %d, different stages (MB):\n" + "\t. symbfact %8.2f\n" + "\t. distribution %8.2f\n" + "\t. numfact %8.2f\n", + all_highmark_rank, mem_stage[0] * 1e-6, mem_stage[1] * 1e-6, mem_stage[2] * 1e-6); + printf("** NUMfact space (MB): (sum-of-all-processes)\n" + " L\\U : %8.2f | Total : %8.2f\n", + for_lu * 1e-6, total * 1e-6); + printf("\t. max at rank %d, max L+U memory (MB): %8.2f\n" + "\t. 
max at rank %d, peak buffer (MB): %8.2f\n", + lu_max_rank, lu_max_mem, + buffer_peak_rank, buffer_peak); + printf("**************************************************\n\n"); + printf("** number of Tiny Pivots: %8d\n\n", stat->TinyPivots); + fflush(stdout); + } + } /* end printing stats */ + + } /* end if not Factored */ + } + + if(Solve3D){ + + if ( options->Fact == DOFACT || options->Fact == SamePattern ) { + /* Need to reset the solve's communication pattern, + because perm_r[] and/or perm_c[] is changed. */ + if ( options->SolveInitialized == YES ) { /* Initialized before */ + dSolveFinalize(options, SOLVEstruct); /* Clean up structure */ + pdgstrs_delete_device_lsum_x(SOLVEstruct); + options->SolveInitialized = NO; /* Reset the solve state */ + } + } + + if (getenv("NEW3DSOLVE")){ + + + if (options->DiagInv == YES && (Fact != FACTORED)) + { + pdCompute_Diag_Inv(n, LUstruct, grid, stat, info); + + // The following #ifdef GPU_ACC block frees and reallocates GPU data for trisolve. The data seems to be overwritten by pdgstrf3d. 
+ int_t nsupers = getNsupers(n, LUstruct->Glu_persist); +#if (defined(GPU_ACC) && defined(GPU_SOLVE)) + + pdconvertU(options, grid, LUstruct, stat, n); + + // checkGPU(gpuFree(LUstruct->Llu->d_xsup)); + // checkGPU(gpuFree(LUstruct->Llu->d_bcols_masked)); + // checkGPU(gpuFree(LUstruct->Llu->d_LRtree_ptr)); + // checkGPU(gpuFree(LUstruct->Llu->d_LBtree_ptr)); + // checkGPU(gpuFree(LUstruct->Llu->d_URtree_ptr)); + // checkGPU(gpuFree(LUstruct->Llu->d_UBtree_ptr)); + // checkGPU(gpuFree(LUstruct->Llu->d_Lrowind_bc_dat)); + // checkGPU(gpuFree(LUstruct->Llu->d_Lindval_loc_bc_dat)); + // checkGPU(gpuFree(LUstruct->Llu->d_Lrowind_bc_offset)); + // checkGPU(gpuFree(LUstruct->Llu->d_Lindval_loc_bc_offset)); + // checkGPU(gpuFree(LUstruct->Llu->d_Lnzval_bc_offset)); + // checkGPU(gpuFree(LUstruct->Llu->d_Linv_bc_offset)); + // checkGPU(gpuFree(LUstruct->Llu->d_Uinv_bc_offset)); + // checkGPU(gpuFree(LUstruct->Llu->d_ilsum)); + // checkGPU(gpuFree(LUstruct->Llu->d_grid)); + // checkGPU(gpuFree(LUstruct->Llu->d_Lnzval_bc_dat)); + // checkGPU(gpuFree(LUstruct->Llu->d_Linv_bc_dat)); + // checkGPU(gpuFree(LUstruct->Llu->d_Uinv_bc_dat)); + + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_xsup, (n + 1) * sizeof(int_t))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_xsup, LUstruct->Glu_persist->xsup, (n + 1) * sizeof(int_t), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc( (void**)&LUstruct->Llu->d_bcols_masked, LUstruct->Llu->nbcol_masked * sizeof(int))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_bcols_masked, LUstruct->Llu->bcols_masked, LUstruct->Llu->nbcol_masked * sizeof(int), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_LRtree_ptr, CEILING(nsupers, grid->nprow) * sizeof(C_Tree))); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_LBtree_ptr, CEILING(nsupers, grid->npcol) * sizeof(C_Tree))); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_URtree_ptr, CEILING(nsupers, grid->nprow) * sizeof(C_Tree))); + // checkGPU(gpuMalloc((void 
**)&LUstruct->Llu->d_UBtree_ptr, CEILING(nsupers, grid->npcol) * sizeof(C_Tree))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_LRtree_ptr, LUstruct->Llu->LRtree_ptr, CEILING(nsupers, grid->nprow) * sizeof(C_Tree), gpuMemcpyHostToDevice)); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_LBtree_ptr, LUstruct->Llu->LBtree_ptr, CEILING(nsupers, grid->npcol) * sizeof(C_Tree), gpuMemcpyHostToDevice)); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_URtree_ptr, LUstruct->Llu->URtree_ptr, CEILING(nsupers, grid->nprow) * sizeof(C_Tree), gpuMemcpyHostToDevice)); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_UBtree_ptr, LUstruct->Llu->UBtree_ptr, CEILING(nsupers, grid->npcol) * sizeof(C_Tree), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lrowind_bc_dat, (LUstruct->Llu->Lrowind_bc_cnt) * sizeof(int_t))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_Lrowind_bc_dat, LUstruct->Llu->Lrowind_bc_dat, (LUstruct->Llu->Lrowind_bc_cnt) * sizeof(int_t), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lindval_loc_bc_dat, (LUstruct->Llu->Lindval_loc_bc_cnt) * sizeof(int_t))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_Lindval_loc_bc_dat, LUstruct->Llu->Lindval_loc_bc_dat, (LUstruct->Llu->Lindval_loc_bc_cnt) * sizeof(int_t), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lrowind_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_Lrowind_bc_offset, LUstruct->Llu->Lrowind_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lindval_loc_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_Lindval_loc_bc_offset, LUstruct->Llu->Lindval_loc_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lnzval_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int))); + // 
checkGPU(gpuMemcpy(LUstruct->Llu->d_Lnzval_bc_offset, LUstruct->Llu->Lnzval_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Linv_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_Linv_bc_offset, LUstruct->Llu->Linv_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Uinv_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_Uinv_bc_offset, LUstruct->Llu->Uinv_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_ilsum, (CEILING(nsupers, grid->nprow) + 1) * sizeof(int_t))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_ilsum, LUstruct->Llu->ilsum, (CEILING(nsupers, grid->nprow) + 1) * sizeof(int_t), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lnzval_bc_dat, (LUstruct->Llu->Lnzval_bc_cnt) * sizeof(double))); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Linv_bc_dat, (LUstruct->Llu->Linv_bc_cnt) * sizeof(double))); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Uinv_bc_dat, (LUstruct->Llu->Uinv_bc_cnt) * sizeof(double))); + // checkGPU(gpuMalloc( (void**)&LUstruct->Llu->d_grid, sizeof(gridinfo_t))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_grid, grid, sizeof(gridinfo_t), gpuMemcpyHostToDevice)); +#endif +if (getenv("SUPERLU_ACC_SOLVE")){ +#ifdef GPU_ACC + checkGPU(gpuMemcpy(LUstruct->Llu->d_Linv_bc_dat, LUstruct->Llu->Linv_bc_dat, + (LUstruct->Llu->Linv_bc_cnt) * sizeof(double), gpuMemcpyHostToDevice)); + checkGPU(gpuMemcpy(LUstruct->Llu->d_Uinv_bc_dat, LUstruct->Llu->Uinv_bc_dat, + (LUstruct->Llu->Uinv_bc_cnt) * sizeof(double), gpuMemcpyHostToDevice)); + checkGPU(gpuMemcpy(LUstruct->Llu->d_Lnzval_bc_dat, LUstruct->Llu->Lnzval_bc_dat, + (LUstruct->Llu->Lnzval_bc_cnt) * 
sizeof(double), gpuMemcpyHostToDevice)); +#endif +} + } + } + }else{ /* if(Solve3D) */ + + if (grid3d->zscp.Iam == 0){ /* on 2D grid-0 */ + + if ( options->Fact == DOFACT || options->Fact == SamePattern ) { + /* Need to reset the solve's communication pattern, + because perm_r[] and/or perm_c[] is changed. */ + if ( options->SolveInitialized == YES ) { /* Initialized before */ + dSolveFinalize(options, SOLVEstruct); /* Clean up structure */ + pdgstrs_delete_device_lsum_x(SOLVEstruct); + options->SolveInitialized = NO; /* Reset the solve state */ + } + } + +#if (defined(GPU_ACC) && defined(GPU_SOLVE)) + if (options->DiagInv == NO) + { + if (iam == 0) + { + printf("!!WARNING: GPU trisolve requires setting options->DiagInv==YES\n"); + printf(" otherwise, use CPU trisolve\n"); + fflush(stdout); + } + // exit(0); // Sherry: need to return an error flag + } +#endif + + if (options->DiagInv == YES && (Fact != FACTORED)) + { + pdCompute_Diag_Inv(n, LUstruct, grid, stat, info); + + // The following #ifdef GPU_ACC block frees and reallocates GPU data for trisolve. The data seems to be overwritten by pdgstrf3d. 
+ int_t nsupers = getNsupers(n, LUstruct->Glu_persist); +#ifdef GPU_ACC + + pdconvertU(options, grid, LUstruct, stat, n); + + // checkGPU(gpuFree(LUstruct->Llu->d_xsup)); + // checkGPU(gpuFree(LUstruct->Llu->d_bcols_masked)); + // checkGPU(gpuFree(LUstruct->Llu->d_LRtree_ptr)); + // checkGPU(gpuFree(LUstruct->Llu->d_LBtree_ptr)); + // checkGPU(gpuFree(LUstruct->Llu->d_URtree_ptr)); + // checkGPU(gpuFree(LUstruct->Llu->d_UBtree_ptr)); + // checkGPU(gpuFree(LUstruct->Llu->d_Lrowind_bc_dat)); + // checkGPU(gpuFree(LUstruct->Llu->d_Lindval_loc_bc_dat)); + // checkGPU(gpuFree(LUstruct->Llu->d_Lrowind_bc_offset)); + // checkGPU(gpuFree(LUstruct->Llu->d_Lindval_loc_bc_offset)); + // checkGPU(gpuFree(LUstruct->Llu->d_Lnzval_bc_offset)); + // checkGPU(gpuFree(LUstruct->Llu->d_Linv_bc_offset)); + // checkGPU(gpuFree(LUstruct->Llu->d_Uinv_bc_offset)); + // checkGPU(gpuFree(LUstruct->Llu->d_ilsum)); + // checkGPU(gpuFree(LUstruct->Llu->d_grid)); + // checkGPU(gpuFree(LUstruct->Llu->d_Lnzval_bc_dat)); + // checkGPU(gpuFree(LUstruct->Llu->d_Linv_bc_dat)); + // checkGPU(gpuFree(LUstruct->Llu->d_Uinv_bc_dat)); + + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_xsup, (n + 1) * sizeof(int_t))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_xsup, LUstruct->Glu_persist->xsup, (n + 1) * sizeof(int_t), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc( (void**)&LUstruct->Llu->d_bcols_masked, LUstruct->Llu->nbcol_masked * sizeof(int))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_bcols_masked, LUstruct->Llu->bcols_masked, LUstruct->Llu->nbcol_masked * sizeof(int), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_LRtree_ptr, CEILING(nsupers, grid->nprow) * sizeof(C_Tree))); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_LBtree_ptr, CEILING(nsupers, grid->npcol) * sizeof(C_Tree))); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_URtree_ptr, CEILING(nsupers, grid->nprow) * sizeof(C_Tree))); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_UBtree_ptr, 
CEILING(nsupers, grid->npcol) * sizeof(C_Tree))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_LRtree_ptr, LUstruct->Llu->LRtree_ptr, CEILING(nsupers, grid->nprow) * sizeof(C_Tree), gpuMemcpyHostToDevice)); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_LBtree_ptr, LUstruct->Llu->LBtree_ptr, CEILING(nsupers, grid->npcol) * sizeof(C_Tree), gpuMemcpyHostToDevice)); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_URtree_ptr, LUstruct->Llu->URtree_ptr, CEILING(nsupers, grid->nprow) * sizeof(C_Tree), gpuMemcpyHostToDevice)); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_UBtree_ptr, LUstruct->Llu->UBtree_ptr, CEILING(nsupers, grid->npcol) * sizeof(C_Tree), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lrowind_bc_dat, (LUstruct->Llu->Lrowind_bc_cnt) * sizeof(int_t))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_Lrowind_bc_dat, LUstruct->Llu->Lrowind_bc_dat, (LUstruct->Llu->Lrowind_bc_cnt) * sizeof(int_t), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lindval_loc_bc_dat, (LUstruct->Llu->Lindval_loc_bc_cnt) * sizeof(int_t))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_Lindval_loc_bc_dat, LUstruct->Llu->Lindval_loc_bc_dat, (LUstruct->Llu->Lindval_loc_bc_cnt) * sizeof(int_t), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lrowind_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_Lrowind_bc_offset, LUstruct->Llu->Lrowind_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lindval_loc_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_Lindval_loc_bc_offset, LUstruct->Llu->Lindval_loc_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lnzval_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int))); + // 
checkGPU(gpuMemcpy(LUstruct->Llu->d_Lnzval_bc_offset, LUstruct->Llu->Lnzval_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Linv_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_Linv_bc_offset, LUstruct->Llu->Linv_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Uinv_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_Uinv_bc_offset, LUstruct->Llu->Uinv_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_ilsum, (CEILING(nsupers, grid->nprow) + 1) * sizeof(int_t))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_ilsum, LUstruct->Llu->ilsum, (CEILING(nsupers, grid->nprow) + 1) * sizeof(int_t), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lnzval_bc_dat, (LUstruct->Llu->Lnzval_bc_cnt) * sizeof(double))); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Linv_bc_dat, (LUstruct->Llu->Linv_bc_cnt) * sizeof(double))); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Uinv_bc_dat, (LUstruct->Llu->Uinv_bc_cnt) * sizeof(double))); + // checkGPU(gpuMalloc( (void**)&LUstruct->Llu->d_grid, sizeof(gridinfo_t))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_grid, grid, sizeof(gridinfo_t), gpuMemcpyHostToDevice)); +#endif + +if (getenv("SUPERLU_ACC_SOLVE")){ +#ifdef GPU_ACC + + checkGPU(gpuMemcpy(LUstruct->Llu->d_Linv_bc_dat, LUstruct->Llu->Linv_bc_dat, + (LUstruct->Llu->Linv_bc_cnt) * sizeof(double), gpuMemcpyHostToDevice)); + checkGPU(gpuMemcpy(LUstruct->Llu->d_Uinv_bc_dat, LUstruct->Llu->Uinv_bc_dat, + (LUstruct->Llu->Uinv_bc_cnt) * sizeof(double), gpuMemcpyHostToDevice)); + checkGPU(gpuMemcpy(LUstruct->Llu->d_Lnzval_bc_dat, LUstruct->Llu->Lnzval_bc_dat, + (LUstruct->Llu->Lnzval_bc_cnt) * 
sizeof(double), gpuMemcpyHostToDevice)); +#endif +} + } + } + } + + + /* ------------------------------------------------------------ + Compute the solution matrix X. + ------------------------------------------------------------ */ + if ((nrhs > 0) && (*info == 0)) + { + if (options->SolveInitialized == NO){ + if (getenv("SUPERLU_ACC_SOLVE")){ + if (getenv("NEW3DSOLVE") && Solve3D==true){ + pdgstrs_init_device_lsum_x(options, n, m_loc, nrhs, grid,LUstruct, SOLVEstruct,trf3Dpartition->supernodeMask); + }else{ + int* supernodeMask = int32Malloc_dist(nsupers); + for(int ii=0; iiutime[SOLVE] = 0.0; + if(Solve3D){ + + // if (!(b_work = doubleMalloc_dist(n))) + // ABORT("Malloc fails for b_work[]"); + /* ------------------------------------------------------ + Scale the right-hand side if equilibration was performed + ------------------------------------------------------*/ + if (notran) + { + if (rowequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= R[irow]; + ++irow; + } + b_col += ldb; + } + } + } + else if (colequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= C[irow]; + ++irow; + } + b_col += ldb; + } + } + + /* Save a copy of the right-hand side. */ + ldx = ldb; + if (!(X = doubleMalloc_dist(((size_t)ldx) * nrhs))) + ABORT("Malloc fails for X[]"); + x_col = X; + b_col = B; + for (j = 0; j < nrhs; ++j) + { + for (i = 0; i < m_loc; ++i) + x_col[i] = b_col[i]; + x_col += ldx; + b_col += ldb; + } + + /* ------------------------------------------------------ + Solve the linear system. + ------------------------------------------------------*/ + + if (options->SolveInitialized == NO) + /* First time */ + /* Inside this routine, SolveInitialized is set to YES. 
+ For repeated call to pdgssvx3d(), no need to re-initialilze + the Solve data & communication structures, unless a new + factorization with Fact == DOFACT or SamePattern is asked for. */ + { + dSolveInit(options, A, perm_r, perm_c, nrhs, LUstruct, + grid, SOLVEstruct); + } + if (getenv("NEW3DSOLVE")){ + pdgstrs3d_newsolve (options, n, LUstruct,ScalePermstruct, trf3Dpartition, grid3d, X, + m_loc, fst_row, ldb, nrhs,SOLVEstruct, stat, info); + }else{ + pdgstrs3d (options, n, LUstruct,ScalePermstruct, trf3Dpartition, grid3d, X, + m_loc, fst_row, ldb, nrhs,SOLVEstruct, stat, info); + } + if (options->IterRefine) + { + /* Improve the solution by iterative refinement. */ + int_t *it, *colind_gsmv = SOLVEstruct->A_colind_gsmv; + dSOLVEstruct_t *SOLVEstruct1; /* Used by refinement */ + + t = SuperLU_timer_ (); + if (options->RefineInitialized == NO || Fact == DOFACT) { + /* All these cases need to re-initialize gsmv structure */ + if (options->RefineInitialized) + pdgsmv_finalize (SOLVEstruct->gsmv_comm); + pdgsmv_init (A, SOLVEstruct->row_to_proc, grid, + SOLVEstruct->gsmv_comm); + + /* Save a copy of the transformed local col indices + in colind_gsmv[]. 
*/ + if (colind_gsmv) SUPERLU_FREE (colind_gsmv); + if (!(it = intMalloc_dist (nnz_loc))) + ABORT ("Malloc fails for colind_gsmv[]"); + colind_gsmv = SOLVEstruct->A_colind_gsmv = it; + for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i]; + options->RefineInitialized = YES; + } + else if (Fact == SamePattern || Fact == SamePattern_SameRowPerm) { + double at; + int_t k, jcol, p; + /* Swap to beginning the part of A corresponding to the + local part of X, as was done in pdgsmv_init() */ + for (i = 0; i < m_loc; ++i) { /* Loop through each row */ + k = rowptr[i]; + for (j = rowptr[i]; j < rowptr[i + 1]; ++j) + { + jcol = colind[j]; + p = SOLVEstruct->row_to_proc[jcol]; + if (p == iam) + { /* Local */ + at = a[k]; + a[k] = a[j]; + a[j] = at; + ++k; + } + } + } + + /* Re-use the local col indices of A obtained from the + previous call to pdgsmv_init() */ + for (i = 0; i < nnz_loc; ++i) + colind[i] = colind_gsmv[i]; + } + + if (nrhs == 1) + { /* Use the existing solve structure */ + SOLVEstruct1 = SOLVEstruct; + } + else { + /* For nrhs > 1, since refinement is performed for RHS + one at a time, the communication structure for pdgstrs + is different than the solve with nrhs RHS. + So we use SOLVEstruct1 for the refinement step. + */ + if (!(SOLVEstruct1 = (dSOLVEstruct_t *) + SUPERLU_MALLOC(sizeof(dSOLVEstruct_t)))) + ABORT ("Malloc fails for SOLVEstruct1"); + /* Copy the same stuff */ + SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc; + SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c; + SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs; + SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs; + SOLVEstruct1->diag_len = SOLVEstruct->diag_len; + SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm; + SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv; + + /* Initialize the *gstrs_comm for 1 RHS. 
*/ + if (!(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *) + SUPERLU_MALLOC (sizeof (pxgstrs_comm_t)))) + ABORT ("Malloc fails for gstrs_comm[]"); + pdgstrs_init (n, m_loc, 1, fst_row, perm_r, perm_c, grid, + LUstruct->Glu_persist, SOLVEstruct1); + if (getenv("SUPERLU_ACC_SOLVE")){ + int_t nsupers = getNsupers(n, LUstruct->Glu_persist); + pdgstrs_init_device_lsum_x(options, n, m_loc, 1, grid,LUstruct, SOLVEstruct1,trf3Dpartition->supernodeMask); + } + } + + pdgsrfs3d (options, n, A, anorm, LUstruct, ScalePermstruct, grid3d, trf3Dpartition, + B, ldb, X, ldx, nrhs, SOLVEstruct1, berr, stat, info); + + /* Deallocate the storage associated with SOLVEstruct1 */ + if (nrhs > 1) + { + pdgstrs_delete_device_lsum_x(SOLVEstruct1); + pxgstrs_finalize (SOLVEstruct1->gstrs_comm); + SUPERLU_FREE (SOLVEstruct1); + } + + stat->utime[REFINE] = SuperLU_timer_ () - t; + } /* end IterRefine */ + }else{ + + if (grid3d->zscp.Iam == 0){ /* on 2D grid-0 */ + + /* ------------------------------------------------------ + Scale the right-hand side if equilibration was performed + ------------------------------------------------------*/ + if (notran) + { + if (rowequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= R[irow]; + ++irow; + } + b_col += ldb; + } + } + } + else if (colequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= C[irow]; + ++irow; + } + b_col += ldb; + } + } + + /* Save a copy of the right-hand side. */ + ldx = ldb; + if (!(X = doubleMalloc_dist(((size_t)ldx) * nrhs))) + ABORT("Malloc fails for X[]"); + x_col = X; + b_col = B; + for (j = 0; j < nrhs; ++j) + { + for (i = 0; i < m_loc; ++i) + x_col[i] = b_col[i]; + x_col += ldx; + b_col += ldb; + } + + /* ------------------------------------------------------ + Solve the linear system. 
+ ------------------------------------------------------*/ + if (options->SolveInitialized == NO) + /* First time */ + /* Inside this routine, SolveInitialized is set to YES. + For repeated call to pdgssvx3d(), no need to re-initialilze + the Solve data & communication structures, unless a new + factorization with Fact == DOFACT or SamePattern is asked for. */ + { + dSolveInit(options, A, perm_r, perm_c, nrhs, LUstruct, + grid, SOLVEstruct); + } + pdgstrs(options, n, LUstruct, ScalePermstruct, grid, X, m_loc, + fst_row, ldb, nrhs, SOLVEstruct, stat, info); + + /* ------------------------------------------------------------ + Use iterative refinement to improve the computed solution and + compute error bounds and backward error estimates for it. + ------------------------------------------------------------ */ + if (options->IterRefine) + { + /* Improve the solution by iterative refinement. */ + int_t *it, *colind_gsmv = SOLVEstruct->A_colind_gsmv; + dSOLVEstruct_t *SOLVEstruct1; /* Used by refinement */ + + t = SuperLU_timer_ (); + if (options->RefineInitialized == NO || Fact == DOFACT) { + /* All these cases need to re-initialize gsmv structure */ + if (options->RefineInitialized) + pdgsmv_finalize (SOLVEstruct->gsmv_comm); + pdgsmv_init (A, SOLVEstruct->row_to_proc, grid, + SOLVEstruct->gsmv_comm); + + /* Save a copy of the transformed local col indices + in colind_gsmv[]. 
*/ + if (colind_gsmv) SUPERLU_FREE (colind_gsmv); + if (!(it = intMalloc_dist (nnz_loc))) + ABORT ("Malloc fails for colind_gsmv[]"); + colind_gsmv = SOLVEstruct->A_colind_gsmv = it; + for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i]; + options->RefineInitialized = YES; + } + else if (Fact == SamePattern || Fact == SamePattern_SameRowPerm) { + double at; + int_t k, jcol, p; + /* Swap to beginning the part of A corresponding to the + local part of X, as was done in pdgsmv_init() */ + for (i = 0; i < m_loc; ++i) { /* Loop through each row */ + k = rowptr[i]; + for (j = rowptr[i]; j < rowptr[i + 1]; ++j) + { + jcol = colind[j]; + p = SOLVEstruct->row_to_proc[jcol]; + if (p == iam) + { /* Local */ + at = a[k]; + a[k] = a[j]; + a[j] = at; + ++k; + } + } + } + + /* Re-use the local col indices of A obtained from the + previous call to pdgsmv_init() */ + for (i = 0; i < nnz_loc; ++i) + colind[i] = colind_gsmv[i]; + } + + if (nrhs == 1) + { /* Use the existing solve structure */ + SOLVEstruct1 = SOLVEstruct; + } + else { + /* For nrhs > 1, since refinement is performed for RHS + one at a time, the communication structure for pdgstrs + is different than the solve with nrhs RHS. + So we use SOLVEstruct1 for the refinement step. + */ + if (!(SOLVEstruct1 = (dSOLVEstruct_t *) + SUPERLU_MALLOC(sizeof(dSOLVEstruct_t)))) + ABORT ("Malloc fails for SOLVEstruct1"); + /* Copy the same stuff */ + SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc; + SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c; + SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs; + SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs; + SOLVEstruct1->diag_len = SOLVEstruct->diag_len; + SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm; + SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv; + + /* Initialize the *gstrs_comm for 1 RHS. 
*/ + if (!(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *) + SUPERLU_MALLOC (sizeof (pxgstrs_comm_t)))) + ABORT ("Malloc fails for gstrs_comm[]"); + pdgstrs_init (n, m_loc, 1, fst_row, perm_r, perm_c, grid, + LUstruct->Glu_persist, SOLVEstruct1); + if (getenv("SUPERLU_ACC_SOLVE")){ + int_t nsupers = getNsupers(n, LUstruct->Glu_persist); + int* supernodeMask = int32Malloc_dist(nsupers); + for(int ii=0; ii 1) + { + pdgstrs_delete_device_lsum_x(SOLVEstruct1); + pxgstrs_finalize (SOLVEstruct1->gstrs_comm); + SUPERLU_FREE (SOLVEstruct1); + } + + stat->utime[REFINE] = SuperLU_timer_ () - t; + } /* end IterRefine */ + } + } + +if (grid3d->zscp.Iam == 0) /* on 2D grid-0 */ + { + /* Permute the solution matrix B <= Pc'*X. */ + pdPermute_Dense_Matrix (fst_row, m_loc, SOLVEstruct->row_to_proc, + SOLVEstruct->inv_perm_c, + X, ldx, B, ldb, nrhs, grid); +#if ( DEBUGlevel>=2 ) + printf ("\n (%d) .. After pdPermute_Dense_Matrix(): b =\n", iam); + for (i = 0; i < m_loc; ++i) + printf ("\t(%d)\t%4d\t%.10f\n", iam, i + fst_row, B[i]); +#endif + /* Transform the solution matrix X to a solution of the original + system before the equilibration. */ + if (notran) + { + if (colequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= C[irow]; + ++irow; + } + b_col += ldb; + } + } + } + else if (rowequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= R[irow]; + ++irow; + } + b_col += ldb; + } + } + + // SUPERLU_FREE (b_work); + } + if (grid3d->zscp.Iam == 0 || Solve3D) + SUPERLU_FREE (X); + + } /* end if nrhs > 0 and factor successful */ + +#if ( PRNTlevel>=1 ) + if (!grid3d->iam) { + printf (".. DiagScale = %d\n", ScalePermstruct->DiagScale); + } +#endif + + + if ( grid3d->zscp.Iam == 0 ) { // only process layer 0 + /* Deallocate R and/or C if it was not used. 
*/ + if (Equil && Fact != SamePattern_SameRowPerm) + { + switch (ScalePermstruct->DiagScale) { + case NOEQUIL: + SUPERLU_FREE (R); + SUPERLU_FREE (C); + break; + case ROW: + SUPERLU_FREE (C); + break; + case COL: + SUPERLU_FREE (R); + break; + default: break; + } + } + +#if 0 + if (!factored && Fact != SamePattern_SameRowPerm && !parSymbFact) + Destroy_CompCol_Permuted_dist (&GAC); +#endif + + } /* process layer 0 done solve */ + + /* Scatter the solution from 2D grid-0 to 3D grid */ + if (nrhs > 0) + dScatter_B3d(A3d, grid3d); + + B = A3d->B3d; // B is now assigned back to B3d on return + A->Store = Astore3d; // restore Astore to 3D + +#if (DEBUGlevel >= 1) + CHECK_MALLOC(iam, "Exit pdgssvx3d()"); +#endif +} diff --git a/SRC/pdgssvx3d_2pass_Yang.c b/SRC/pdgssvx3d_2pass_Yang.c new file mode 100755 index 00000000..036bc8f6 --- /dev/null +++ b/SRC/pdgssvx3d_2pass_Yang.c @@ -0,0 +1,2150 @@ + +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + +/*! @file + * \brief Solves a system of linear equations A*X=B using 3D process grid. + * + *
+ * -- Distributed SuperLU routine (version 7.2) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ * October 5, 2021
+ * Last update: November 8, 2021  v7.2.0
+ */
+#include "superlu_ddefs.h"
+#include "TRF3dV100/superlu_summit.h"
+#include "pddistribute3d.h"
+#include "ssvx3dAux.c"
+int_t dgatherAllFactoredLU3d( dtrf3Dpartition_t*  trf3Dpartition,
+			   dLUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT );
+#include 
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * PDGSSVX3D solves a system of linear equations A*X=B,
+ * by using Gaussian elimination with "static pivoting" to
+ * compute the LU factorization of A.
+ *
+ * Static pivoting is a technique that combines the numerical stability
+ * of partial pivoting with the scalability of Cholesky (no pivoting),
+ * to run accurately and efficiently on large numbers of processors.
+ * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed
+ * description of the parallel algorithms.
+ *
+ * The input matrices A and B are distributed by block rows.
+ * Here is a graphical illustration (0-based indexing):
+ *
+ *                        A                B
+ *               0 ---------------       ------
+ *                   |           |        |  |
+ *                   |           |   P0   |  |
+ *                   |           |        |  |
+ *                 ---------------       ------
+ *        - fst_row->|           |        |  |
+ *        |          |           |        |  |
+ *       m_loc       |           |   P1   |  |
+ *        |          |           |        |  |
+ *        -          |           |        |  |
+ *                 ---------------       ------
+ *                   |    .      |        |. |
+ *                   |    .      |        |. |
+ *                   |    .      |        |. |
+ *                 ---------------       ------
+ *
+ * where, fst_row is the row number of the first row,
+ *        m_loc is the number of rows local to this processor
+ * These are defined in the 'SuperMatrix' structure, see supermatrix.h.
+ *
+ *
+ * Here are the options for using this code:
+ *
+ *   1. Independent of all the other options specified below, the
+ *      user must supply
+ *
+ *      -  B, the matrix of right-hand sides, distributed by block rows,
+ *            and its dimensions ldb (local) and nrhs (global)
+ *      -  grid, a structure describing the 2D processor mesh
+ *      -  options->IterRefine, which determines whether or not to
+ *            improve the accuracy of the computed solution using
+ *            iterative refinement
+ *
+ *      On output, B is overwritten with the solution X.
+ *
+ *   2. Depending on options->Fact, the user has four options
+ *      for solving A*X=B. The standard option is for factoring
+ *      A "from scratch". (The other options, described below,
+ *      are used when A is sufficiently similar to a previously
+ *      solved problem to save time by reusing part or all of
+ *      the previous factorization.)
+ *
+ *      -  options->Fact = DOFACT: A is factored "from scratch"
+ *
+ *      In this case the user must also supply
+ *
+ *        o  A, the input matrix
+ *
+ *        as well as the following options to determine what matrix to
+ *        factorize.
+ *
+ *        o  options->Equil,   to specify how to scale the rows and columns
+ *                             of A to "equilibrate" it (to try to reduce its
+ *                             condition number and so improve the
+ *                             accuracy of the computed solution)
+ *
+ *        o  options->RowPerm, to specify how to permute the rows of A
+ *                             (typically to control numerical stability)
+ *
+ *        o  options->ColPerm, to specify how to permute the columns of A
+ *                             (typically to control fill-in and enhance
+ *                             parallelism during factorization)
+ *
+ *        o  options->ReplaceTinyPivot, to specify how to deal with tiny
+ *                             pivots encountered during factorization
+ *                             (to control numerical stability)
+ *
+ *      The outputs returned include
+ *
+ *        o  ScalePermstruct,  modified to describe how the input matrix A
+ *                             was equilibrated and permuted:
+ *          .  ScalePermstruct->DiagScale, indicates whether the rows and/or
+ *                                         columns of A were scaled
+ *          .  ScalePermstruct->R, array of row scale factors
+ *          .  ScalePermstruct->C, array of column scale factors
+ *          .  ScalePermstruct->perm_r, row permutation vector
+ *          .  ScalePermstruct->perm_c, column permutation vector
+ *
+ *          (part of ScalePermstruct may also need to be supplied on input,
+ *           depending on options->RowPerm and options->ColPerm as described
+ *           later).
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix diag(R)*A*diag(C)*Pc^T, where
+ *              Pc is the column permutation matrix determined by
+ *                  ScalePermstruct->perm_c
+ *              diag(R) and diag(C) are diagonal scaling matrices determined
+ *                  by ScalePermstruct->DiagScale, ScalePermstruct->R and
+ *                  ScalePermstruct->C
+ *
+ *        o  LUstruct, which contains the L and U factorization of A1 where
+ *
+ *                A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
+ *
+ *               (Note that A1 = Pc*Pr*Aout, where Aout is the matrix stored
+ *                in A on output.)
+ *
+ *   3. The second value of options->Fact assumes that a matrix with the same
+ *      sparsity pattern as A has already been factored:
+ *
+ *      -  options->Fact = SamePattern: A is factored, assuming that it has
+ *            the same nonzero pattern as a previously factored matrix. In
+ *            this case the algorithm saves time by reusing the previously
+ *            computed column permutation vector stored in
+ *            ScalePermstruct->perm_c and the "elimination tree" of A
+ *            stored in LUstruct->etree
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *        o  options->Equil
+ *        o  options->RowPerm
+ *        o  options->ReplaceTinyPivot
+ *
+ *      but not options->ColPerm, whose value is ignored. This is because the
+ *      previous column permutation from ScalePermstruct->perm_c is used as
+ *      input. The user must also supply
+ *
+ *        o  A, the input matrix
+ *        o  ScalePermstruct->perm_c, the column permutation
+ *        o  LUstruct->etree, the elimination tree
+ *
+ *      The outputs returned include
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix as described above
+ *        o  ScalePermstruct, modified to describe how the input matrix A was
+ *                            equilibrated and row permuted
+ *        o  LUstruct, modified to contain the new L and U factors
+ *
+ *   4. The third value of options->Fact assumes that a matrix B with the same
+ *      sparsity pattern as A has already been factored, and where the
+ *      row permutation of B can be reused for A. This is useful when A and B
+ *      have similar numerical values, so that the same row permutation
+ *      will make both factorizations numerically stable. This lets us reuse
+ *      all of the previously computed structure of L and U.
+ *
+ *      -  options->Fact = SamePattern_SameRowPerm: A is factored,
+ *            assuming not only the same nonzero pattern as the previously
+ *            factored matrix B, but reusing B's row permutation.
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *        o  options->Equil
+ *        o  options->ReplaceTinyPivot
+ *
+ *      but not options->RowPerm or options->ColPerm, whose values are
+ *      ignored. This is because the permutations from ScalePermstruct->perm_r
+ *      and ScalePermstruct->perm_c are used as input.
+ *
+ *      The user must also supply
+ *
+ *        o  A, the input matrix
+ *        o  ScalePermstruct->DiagScale, how the previous matrix was row
+ *                                       and/or column scaled
+ *        o  ScalePermstruct->R, the row scalings of the previous matrix,
+ *                               if any
+ *        o  ScalePermstruct->C, the column scalings of the previous matrix,
+ *                               if any
+ *        o  ScalePermstruct->perm_r, the row permutation of the previous
+ *                                    matrix
+ *        o  ScalePermstruct->perm_c, the column permutation of the previous
+ *                                    matrix
+ *        o  all of LUstruct, the previously computed information about
+ *                            L and U (the actual numerical values of L and U
+ *                            stored in LUstruct->Llu are ignored)
+ *
+ *      The outputs returned include
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix as described above
+ *        o  ScalePermstruct,  modified to describe how the input matrix A was
+ *                             equilibrated (thus ScalePermstruct->DiagScale,
+ *                             R and C may be modified)
+ *        o  LUstruct, modified to contain the new L and U factors
+ *
+ *   5. The fourth and last value of options->Fact assumes that A is
+ *      identical to a matrix that has already been factored on a previous
+ *      call, and reuses its entire LU factorization
+ *
+ *      -  options->Fact = Factored: A is identical to a previously
+ *            factorized matrix, so the entire previous factorization
+ *            can be reused.
+ *
+ *      In this case all the other options mentioned above are ignored
+ *      (options->Equil, options->RowPerm, options->ColPerm,
+ *       options->ReplaceTinyPivot)
+ *
+ *      The user must also supply
+ *
+ *        o  A, the unfactored matrix, only in the case that iterative
+ *              refinement is to be done (specifically A must be the output
+ *              A from the previous call, so that it has been scaled and permuted)
+ *        o  all of ScalePermstruct
+ *        o  all of LUstruct, including the actual numerical values of
+ *           L and U
+ *
+ *      all of which are unmodified on output.
+ *
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_dist_options_t* (global)
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *         The following fields should be defined for this structure:
+ *
+ *         o Fact (fact_t)
+ *           Specifies whether or not the factored form of the matrix
+ *           A is supplied on entry, and if not, how the matrix A should
+ *           be factorized based on the previous history.
+ *
+ *           = DOFACT: The matrix A will be factorized from scratch.
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ColPerm, ReplaceTinyPivot
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *           = SamePattern: the matrix A will be factorized assuming
+ *             that a factorization of a matrix with the same sparsity
+ *             pattern was performed prior to this one. Therefore, this
+ *             factorization will reuse column permutation vector
+ *             ScalePermstruct->perm_c and the elimination tree
+ *             LUstruct->etree
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ReplaceTinyPivot
+ *                          ScalePermstruct->perm_c
+ *                          LUstruct->etree
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          rest of ScalePermstruct (DiagScale, R, C, perm_r)
+ *                          rest of LUstruct (GLU_persist, Llu)
+ *
+ *           = SamePattern_SameRowPerm: the matrix A will be factorized
+ *             assuming that a factorization of a matrix with the same
+ *             sparsity pattern and similar numerical values was performed
+ *             prior to this one. Therefore, this factorization will reuse
+ *             both the row and column scaling factors R and C, both the
+ *             row and column permutation vectors perm_r and perm_c, and the
+ *             distributed data structure set up from the previous symbolic
+ *             factorization.
+ *                 Inputs:  A
+ *                          options->Equil, ReplaceTinyPivot
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          modified LUstruct->Llu
+ *           = FACTORED: the matrix A is already factored.
+ *                 Inputs:  all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *         o Equil (yes_no_t)
+ *           Specifies whether to equilibrate the system.
+ *           = NO:  no equilibration.
+ *           = YES: scaling factors are computed to equilibrate the system:
+ *                      diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B.
+ *                  Whether or not the system will be equilibrated depends
+ *                  on the scaling of the matrix A, but if equilibration is
+ *                  used, A is overwritten by diag(R)*A*diag(C) and B by
+ *                  diag(R)*B.
+ *
+ *         o RowPerm (rowperm_t)
+ *           Specifies how to permute rows of the matrix A.
+ *           = NATURAL:   use the natural ordering.
+ *           = LargeDiag_MC64: use the Duff/Koster algorithm to permute rows of
+ *                        the original matrix to make the diagonal large
+ *                        relative to the off-diagonal.
+ *           = LargeDiag_HPWM: use the parallel approximate-weight perfect
+ *                        matching to permute rows of the original matrix
+ *                        to make the diagonal large relative to the
+ *                        off-diagonal.
+ *           = MY_PERMR:  use the ordering given in ScalePermstruct->perm_r
+ *                        input by the user.
+ *
+ *         o ColPerm (colperm_t)
+ *           Specifies what type of column permutation to use to reduce fill.
+ *           = NATURAL:       natural ordering.
+ *           = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A.
+ *           = MMD_ATA:       minimum degree ordering on structure of A'*A.
+ *           = MY_PERMC:      the ordering given in ScalePermstruct->perm_c.
+ *
+ *         o ReplaceTinyPivot (yes_no_t)
+ *           = NO:  do not modify pivots
+ *           = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during
+ *                  LU factorization.
+ *
+ *         o IterRefine (IterRefine_t)
+ *           Specifies how to perform iterative refinement.
+ *           = NO:     no iterative refinement.
+ *           = SLU_DOUBLE: accumulate residual in double precision.
+ *           = SLU_EXTRA:  accumulate residual in extra precision.
+ *
+ *         NOTE: all options must be identical on all processes when
+ *               calling this routine.
+ *
+ * A (input) SuperMatrix* (local); A resides on all 3D processes.
+ *         On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol).
+ *           The number of linear equations is A->nrow. The type of A must be:
+ *           Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE.
+ *           That is, A is stored in distributed compressed row format.
+ *           See supermatrix.h for the definition of 'SuperMatrix'.
+ *           This routine only handles square A, however, the LU factorization
+ *           routine PDGSTRF can factorize rectangular matrices.
+ *
+ *         Internally, A is gathered on 2D process grid-0, call it A2d.
+ *         On exit, A2d may be overwritten by diag(R)*A*diag(C)*Pc^T,
+ *           depending on ScalePermstruct->DiagScale and options->ColPerm:
+ *             if ScalePermstruct->DiagScale != NOEQUIL, A2d is overwritten by
+ *                diag(R)*A*diag(C).
+ *             if options->ColPerm != NATURAL, A2d is further overwritten by
+ *                diag(R)*A*diag(C)*Pc^T.
+ *           If all the above conditions are true, the LU decomposition is
+ *           performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T.
+ *
+ * ScalePermstruct (input/output) dScalePermstruct_t* (global)
+ *         The data structure to store the scaling and permutation vectors
+ *         describing the transformations performed to the matrix A.
+ *         It contains the following fields:
+ *
+ *         o DiagScale (DiagScale_t)
+ *           Specifies the form of equilibration that was done.
+ *           = NOEQUIL: no equilibration.
+ *           = ROW:     row equilibration, i.e., A was premultiplied by
+ *                      diag(R).
+ *           = COL:     Column equilibration, i.e., A was postmultiplied
+ *                      by diag(C).
+ *           = BOTH:    both row and column equilibration, i.e., A was
+ *                      replaced by diag(R)*A*diag(C).
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm,
+ *           DiagScale is an input argument; otherwise it is an output
+ *           argument.
+ *
+ *         o perm_r (int*)
+ *           Row permutation vector, which defines the permutation matrix Pr;
+ *           perm_r[i] = j means row i of A is in position j in Pr*A.
+ *           If options->RowPerm = MY_PERMR, or
+ *           options->Fact = SamePattern_SameRowPerm, perm_r is an
+ *           input argument; otherwise it is an output argument.
+ *
+ *         o perm_c (int*)
+ *           Column permutation vector, which defines the
+ *           permutation matrix Pc; perm_c[i] = j means column i of A is
+ *           in position j in A*Pc.
+ *           If options->ColPerm = MY_PERMC or options->Fact = SamePattern
+ *           or options->Fact = SamePattern_SameRowPerm, perm_c is an
+ *           input argument; otherwise, it is an output argument.
+ *           On exit, perm_c may be overwritten by the product of the input
+ *           perm_c and a permutation that postorders the elimination tree
+ *           of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree
+ *           is already in postorder.
+ *
+ *         o R (double *) dimension (A->nrow)
+ *           The row scale factors for A.
+ *           If DiagScale = ROW or BOTH, A is multiplied on the left by
+ *                          diag(R).
+ *           If DiagScale = NOEQUIL or COL, R is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, R is
+ *           an input argument; otherwise, R is an output argument.
+ *
+ *         o C (double *) dimension (A->ncol)
+ *           The column scale factors for A.
+ *           If DiagScale = COL or BOTH, A is multiplied on the right by
+ *                          diag(C).
+ *           If DiagScale = NOEQUIL or ROW, C is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, C is
+ *           an input argument; otherwise, C is an output argument.
+ *
+ * B       (input/output) double* (local)
+ *         On entry, the right-hand side matrix of dimension (m_loc, nrhs),
+ *           where, m_loc is the number of rows stored locally on my
+ *           process and is defined in the data structure of matrix A.
+ *         On exit, the solution matrix if info = 0;
+ *
+ * ldb     (input) int (local)
+ *         The leading dimension of matrix B.
+ *
+ * nrhs    (input) int (global)
+ *         The number of right-hand sides.
+ *         If nrhs = 0, only LU decomposition is performed, the forward
+ *         and back substitutions are skipped.
+ *
+ * grid    (input) gridinfo_t* (global)
+ *         The 2D process mesh. It contains the MPI communicator, the number
+ *         of process rows (NPROW), the number of process columns (NPCOL),
+ *         and my process rank. It is an input argument to all the
+ *         parallel routines.
+ *         Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *         See superlu_ddefs.h for the definition of 'gridinfo_t'.
+ *
+ * LUstruct (input/output) dLUstruct_t*
+ *         The data structures to store the distributed L and U factors.
+ *         It contains the following fields:
+ *
+ *         o etree (int*) dimension (A->ncol) (global)
+ *           Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'.
+ *           It is computed in sp_colorder() during the first factorization,
+ *           and is reused in the subsequent factorizations of the matrices
+ *           with the same nonzero pattern.
+ *           On exit of sp_colorder(), the columns of A are permuted so that
+ *           the etree is in a certain postorder. This postorder is reflected
+ *           in ScalePermstruct->perm_c.
+ *           NOTE:
+ *           Etree is a vector of parent pointers for a forest whose vertices
+ *           are the integers 0 to A->ncol-1; etree[root]==A->ncol.
+ *
+ *         o Glu_persist (Glu_persist_t*) (global)
+ *           Global data structure (xsup, supno) replicated on all processes,
+ *           describing the supernode partition in the factored matrices
+ *           L and U:
+ *	       xsup[s] is the leading column of the s-th supernode,
+ *             supno[i] is the supernode number to which column i belongs.
+ *
+ *         o Llu (dLocalLU_t*) (local)
+ *           The distributed data structures to store L and U factors.
+ *           See superlu_ddefs.h for the definition of 'dLocalLU_t'.
+ *
+ * SOLVEstruct (input/output) dSOLVEstruct_t*
+ *         The data structure to hold the communication pattern used
+ *         in the phases of triangular solution and iterative refinement.
+ *         This pattern should be initialized only once for repeated solutions.
+ *         If options->SolveInitialized = YES, it is an input argument.
+ *         If options->SolveInitialized = NO and nrhs != 0, it is an output
+ *         argument. See superlu_ddefs.h for the definition of 'dSOLVEstruct_t'.
+ *
+ * berr    (output) double*, dimension (nrhs) (global)
+ *         The componentwise relative backward error of each solution
+ *         vector X(j) (i.e., the smallest relative change in
+ *         any element of A or B that makes X(j) an exact solution).
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util_dist.h for the definition of 'SuperLUStat_t'.
+ *
+ * info    (output) int*
+ *         = 0: successful exit
+ *         < 0: if info = -i, the i-th argument had an illegal value
+ *         > 0: if info = i, and i is
+ *             <= A->ncol: U(i,i) is exactly zero. The factorization has
+ *                been completed, but the factor U is exactly singular,
+ *                so the solution could not be computed.
+ *             > A->ncol: number of bytes allocated when memory allocation
+ *                failure occurred, plus A->ncol.
+ *
+ * See superlu_ddefs.h for the definitions of various data types.
+ * 
+ */
+// dSOLVEstruct3d_t * SOLVEstruct,
+// SOLVEstruct->A3d
+
+/*! \brief Write the L and U numerical values held in LUstruct->Llu to the
+ *  file named by the LUFILE environment variable, as raw doubles.
+ *
+ *  For each i in [0, nsupers): if Lrowind_bc_ptr[i] is non-NULL, the
+ *  SuperSize(i) * lsub[1] values of Lnzval_bc_ptr[i] are written; then, if
+ *  Ufstnz_br_ptr[i] is non-NULL, the usub[1] values of Unzval_br_ptr[i].
+ *  checkLUFromDisk() reads the values back in this same order.
+ *
+ *  \param nsupers   number of entries of the L/U pointer arrays to visit
+ *  \param xsup      not referenced by name in this body; presumably consumed
+ *                   by the SuperSize() macro -- TODO confirm
+ *  \param LUstruct  structure whose Llu arrays are written
+ *  \return 0 on success, -1 if the file cannot be opened or a write/close
+ *          fails. If LUFILE is unset, a message is printed and the process
+ *          exits.
+ */
+int writeLUtoDisk(int nsupers, int_t *xsup, dLUstruct_t *LUstruct)
+{
+    const char *lufile = getenv("LUFILE");
+
+    if (!lufile)
+    {
+        printf("Please set environment variable LUFILE to write\n..bye bye");
+        /* NOTE(review): exits with status 0 although this is an error path. */
+        exit(0);
+    }
+
+    /* "wb": the payload is raw binary, so no text-mode translation is wanted. */
+    FILE *fp = fopen(lufile, "wb");
+    if (!fp)
+    {
+        perror("writeLUtoDisk: cannot open LUFILE");
+        return -1;
+    }
+    printf("writing to %s", lufile);
+
+    int status = 0;
+    for (int i = 0; i < nsupers && status == 0; i++)
+    {
+        if (LUstruct->Llu->Lrowind_bc_ptr[i])
+        {
+            int_t *lsub = LUstruct->Llu->Lrowind_bc_ptr[i];
+            double *nzval = LUstruct->Llu->Lnzval_bc_ptr[i];
+
+            int_t len = lsub[1]; /* LDA of the nzval[] */
+            int_t len2 = SuperSize(i) * len;
+            /* fwrite() may write fewer items than requested; treat that as failure. */
+            if (fwrite(nzval, sizeof(double), (size_t)len2, fp) != (size_t)len2)
+                status = -1;
+        }
+
+        if (status == 0 && LUstruct->Llu->Ufstnz_br_ptr[i])
+        {
+            int_t *usub = LUstruct->Llu->Ufstnz_br_ptr[i];
+            double *nzval = LUstruct->Llu->Unzval_br_ptr[i];
+            int_t lenv = usub[1];
+
+            if (fwrite(nzval, sizeof(double), (size_t)lenv, fp) != (size_t)lenv)
+                status = -1;
+        }
+    }
+
+    if (status != 0)
+        fprintf(stderr, "writeLUtoDisk: short write to %s\n", lufile);
+
+    /* fclose() flushes buffered output, so its result matters for a written file. */
+    if (fclose(fp) != 0)
+    {
+        perror("writeLUtoDisk: cannot close LUFILE");
+        status = -1;
+    }
+
+    return status;
+}
+
+#define EPSILON 1e-3
+
+/*! \brief Compare two arrays of n doubles entry by entry.
+ *
+ *  A pair passes when |A[i] - B[i]| <= EPSILON * min(|A[i]|, |B[i]|).
+ *  The first failing pair is reported on stderr and the process is aborted.
+ *  The test is an explicit comparison rather than assert(), so it is still
+ *  performed when the file is compiled with NDEBUG.
+ *
+ *  \return 0 when every pair passes (otherwise the call does not return).
+ */
+static int checkArr(double *A, double *B, int n)
+{
+    for (int i = 0; i < n; i++)
+    {
+        double diff = fabs(A[i] - B[i]);
+        double tol = EPSILON * SUPERLU_MIN(fabs(A[i]), fabs(B[i]));
+
+        /* Written as !(diff <= tol) so that a NaN on either side is rejected. */
+        if (!(diff <= tol))
+        {
+            fprintf(stderr, "checkArr: mismatch at index %d: %.17g vs %.17g\n",
+                    i, A[i], B[i]);
+            abort();
+        }
+    }
+
+    return 0;
+}
+
+/*! \brief Read back the file named by the LUFILE environment variable and
+ *  compare its contents against the L and U values held in LUstruct->Llu.
+ *
+ *  Values are read in the order writeLUtoDisk() writes them and compared
+ *  with checkArr(), which aborts on the first mismatch.
+ *
+ *  \param nsupers   number of entries of the L/U pointer arrays to visit
+ *  \param xsup      not referenced by name in this body; presumably consumed
+ *                   by the SuperSize() macro -- TODO confirm
+ *  \param LUstruct  structure whose Llu arrays are compared with the file
+ *  \return 0 when all values were read and compared, -1 if the file cannot
+ *          be opened or ends before all values were read. If LUFILE is
+ *          unset, a message is printed and the process exits.
+ */
+int checkLUFromDisk(int nsupers, int_t *xsup, dLUstruct_t *LUstruct)
+{
+    dLocalLU_t *Llu = LUstruct->Llu;
+    const char *lufile = getenv("LUFILE");
+
+    if (!lufile)
+    {
+        printf("Please set environment variable LUFILE to read\n..bye bye");
+        /* NOTE(review): exits with status 0 although this is an error path. */
+        exit(0);
+    }
+
+    /* "rb": the file holds raw binary doubles. */
+    FILE *fp = fopen(lufile, "rb");
+    if (!fp)
+    {
+        perror("checkLUFromDisk: cannot open LUFILE");
+        return -1;
+    }
+
+    /* Scratch buffers sized from Llu->bufmax[1] (L) and Llu->bufmax[3] (U). */
+    double *Lval_buf = doubleMalloc_dist(Llu->bufmax[1]); // DOUBLE_ALLOC(Llu->bufmax[1]);
+    double *Uval_buf = doubleMalloc_dist(Llu->bufmax[3]); // DOUBLE_ALLOC(Llu->bufmax[3]);
+    int status = 0;
+
+    printf("reading from %s", lufile);
+    for (int i = 0; i < nsupers; i++)
+    {
+        if (Llu->Lrowind_bc_ptr[i])
+        {
+            int_t *lsub = Llu->Lrowind_bc_ptr[i];
+            double *nzval = Llu->Lnzval_bc_ptr[i];
+
+            int_t len = lsub[1]; /* LDA of the nzval[] */
+            int_t len2 = SuperSize(i) * len;
+            /* A short read would leave stale data in the buffer; stop instead
+               of comparing against it. */
+            if (fread(Lval_buf, sizeof(double), (size_t)len2, fp) != (size_t)len2)
+            {
+                status = -1;
+                break;
+            }
+            checkArr(nzval, Lval_buf, len2);
+        }
+
+        if (Llu->Ufstnz_br_ptr[i])
+        {
+            int_t *usub = Llu->Ufstnz_br_ptr[i];
+            double *nzval = Llu->Unzval_br_ptr[i];
+            int_t lenv = usub[1];
+
+            if (fread(Uval_buf, sizeof(double), (size_t)lenv, fp) != (size_t)lenv)
+            {
+                status = -1;
+                break;
+            }
+            checkArr(nzval, Uval_buf, lenv);
+        }
+    }
+
+    if (status == 0)
+        printf("CHecking LU from %s is succesful ", lufile);
+    else
+        fprintf(stderr, "checkLUFromDisk: %s ended before all values were read\n",
+                lufile);
+    fclose(fp);
+
+    /* Release the scratch buffers on every return path; guarded the same way
+       as the other SUPERLU_FREE calls in this file. */
+    if (Lval_buf)
+        SUPERLU_FREE(Lval_buf);
+    if (Uval_buf)
+        SUPERLU_FREE(Uval_buf);
+
+    return status;
+}
+
+
+/*! \brief Dump the factored matrix L using matlab triple-let format
+ */
+void dDumpLblocks3D(int_t nsupers, gridinfo3d_t *grid3d,
+                  Glu_persist_t *Glu_persist, dLocalLU_t *Llu)
+{
+    register int c, extra, gb, j, i, lb, nsupc, nsupr, len, nb, ncb;
+    int k, mycol, r, n, nmax;
+    int_t nnzL;
+    int_t *xsup = Glu_persist->xsup;
+    int_t *index;
+    double *nzval;
+    char filename[256];
+    FILE *fp, *fopen();
+    gridinfo_t *grid = &(grid3d->grid2d);
+    int iam = grid->iam;
+    int iam3d = grid3d->iam;
+
+    // assert(grid->npcol*grid->nprow==1);
+
+    // count nonzeros in the first pass
+    nnzL = 0;
+    n = 0;
+    ncb = nsupers / grid->npcol;
+    extra = nsupers % grid->npcol;
+    mycol = MYCOL( iam, grid );
+    if ( mycol < extra ) ++ncb;
+    for (lb = 0; lb < ncb; ++lb) {
+        index = Llu->Lrowind_bc_ptr[lb];
+        if ( index ) { /* Not an empty column */
+            nzval = Llu->Lnzval_bc_ptr[lb];
+            nb = index[0];
+            nsupr = index[1];
+            gb = lb * grid->npcol + mycol;
+            nsupc = SuperSize( gb );
+            for (c = 0, k = BC_HEADER, r = 0; c < nb; ++c) {
+                len = index[k+1];
+
+                for (j = 0; j < nsupc; ++j) {
+                for (i=0; i=xsup[gb]+j+1){
+                    nnzL ++;
+                    nmax = SUPERLU_MAX(n,index[k+LB_DESCRIPTOR+i]+1);
+                    n = nmax;
+                }
+
+                }
+                }
+                k += LB_DESCRIPTOR + len;
+                r += len;
+            }
+        }
+    }
+    MPI_Allreduce(MPI_IN_PLACE,&nnzL,1,mpi_int_t,MPI_SUM,grid->comm);
+    MPI_Allreduce(MPI_IN_PLACE,&n,1,mpi_int_t,MPI_MAX,grid->comm);
+
+    snprintf(filename, sizeof(filename), "%s-%d", "L", iam3d);
+    printf("Dumping L factor to --> %s\n", filename);
+    if ( !(fp =
fopen(filename, "w")) ) { + ABORT("File open failed"); + } + + if(grid->iam==0){ + fprintf(fp, "%d %d " IFMT "\n", n,n,nnzL); + } + + ncb = nsupers / grid->npcol; + extra = nsupers % grid->npcol; + mycol = MYCOL( iam, grid ); + if ( mycol < extra ) ++ncb; + for (lb = 0; lb < ncb; ++lb) { + index = Llu->Lrowind_bc_ptr[lb]; + if ( index ) { /* Not an empty column */ + nzval = Llu->Lnzval_bc_ptr[lb]; + nb = index[0]; + nsupr = index[1]; + gb = lb * grid->npcol + mycol; + nsupc = SuperSize( gb ); + for (c = 0, k = BC_HEADER, r = 0; c < nb; ++c) { + len = index[k+1]; + + for (j = 0; j < nsupc; ++j) { + for (i=0; iStore; + SuperMatrix GA; /* Global A in NC format */ + NCformat *GAstore; + double *a_GA; + SuperMatrix GAC; /* Global A in NCP format (add n end pointers) */ + NCPformat *GACstore; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + Glu_freeable_t *Glu_freeable = NULL; + /* The nonzero structures of L and U factors, which are + replicated on all processrs. + (lsub, xlsub) contains the compressed subscript of + supernodes in L. + (usub, xusub) contains the compressed subscript of + nonzero segments in U. + If options->Fact != SamePattern_SameRowPerm, they are + computed by SYMBFACT routine, and then used by PDDISTRIBUTE + routine. They will be freed after PDDISTRIBUTE routine. + If options->Fact == SamePattern_SameRowPerm, these + structures are not used. */ + yes_no_t parSymbFact = options->ParSymbFact; + fact_t Fact; + double *a; + int_t *colptr, *rowind; + int_t *perm_r; /* row permutations from partial pivoting */ + int_t *perm_c; /* column permutation vector */ + int_t *etree; /* elimination tree */ + int_t *rowptr, *colind; /* Local A in NR */ + int_t colequ, Equil, factored, job, notran, rowequ, need_value; + int_t i, iinfo, j, irow, m, n, nnz, permc_spec; + int_t nnz_loc, m_loc, fst_row, icol; + int iam; + int ldx; /* LDA for matrix X (local). 
*/ + char equed[1], norm[1]; + double *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd; + double *X, *b_col, *b_work, *x_col; + double t; + float GA_mem_use; /* memory usage by global A */ + float dist_mem_use; /* memory usage during distribution */ + superlu_dist_mem_usage_t num_mem_usage, symb_mem_usage; + float flinfo; /* track memory usage of parallel symbolic factorization */ + bool Solve3D = true; + int_t nsupers; +#if (PRNTlevel >= 2) + double dmin, dsum, dprod; +#endif + + dtrf3Dpartition_t *trf3Dpartition=LUstruct->trf3Dpart; + int gpu3dVersion = 0; + #ifdef GPU_ACC + // gpu3dVersion = 1; + if (getenv("GPU3DVERSION")) + { + gpu3dVersion = atoi(getenv("GPU3DVERSION")); + } + + LUgpu_Handle LUgpu; + #endif + + + LUstruct->dt = 'd'; + + // get the 2d grid + gridinfo_t *grid = &(grid3d->grid2d); + iam = grid->iam; + + /* Test the options choices. */ + *info = 0; + Fact = options->Fact; + validateInput_ssvx3d(options, A, ldb, nrhs, grid3d, info); + + /* Initialization. */ + + options->Algo3d = YES; + + /* definition of factored seen by each process layer */ + factored = (Fact == FACTORED); + + /* Save the inputs: ldb -> ldb3d, and B -> B3d, Astore -> Astore3d, + so that the names {ldb, B, and Astore} can be used internally. + B3d and Astore3d will be assigned back to B and Astore on return.*/ + int ldb3d = ldb; + NRformat_loc *Astore3d = (NRformat_loc *)A->Store; + NRformat_loc3d *A3d = SOLVEstruct->A3d; + + /* B3d is aliased to B; + B2d is allocated; + B is then aliased to B2d for the following 2D solve; + */ + dGatherNRformat_loc3d_allgrid(Fact, (NRformat_loc *)A->Store, + B, ldb, nrhs, grid3d, &A3d); + + B = (double *)A3d->B2d; /* B is now pointing to B2d, + allocated in dGatherNRformat_loc3d. 
*/ + // PrintDouble5("after gather B=B2d", ldb, B); + + SOLVEstruct->A3d = A3d; /* This structure need to be persistent across + multiple calls of pdgssvx3d() */ + + NRformat_loc *Astore0 = A3d->A_nfmt; // on all grids + NRformat_loc *A_orig = A->Store; + ////// + +#if (DEBUGlevel >= 1) + CHECK_MALLOC(iam, "Enter pdgssvx3d()"); +#endif + + /* Perform preprocessing steps on process layer zero, including: + gather 3D matrices {A, B} onto 2D grid-0, preprocessing steps: + - equilibration, + - ordering, + - symbolic factorization, + - distribution of L & U */ + + m = A->nrow; + n = A->ncol; + // checkNRFMT(Astore0, (NRformat_loc *) A->Store); + + // On input, A->Store is on 3D, now A->Store is re-assigned to 2D store + A->Store = Astore0; // on all grids + ldb = Astore0->m_loc; + + /* The following code now works on all grids */ + Astore = (NRformat_loc *)A->Store; + nnz_loc = Astore->nnz_loc; + m_loc = Astore->m_loc; + fst_row = Astore->fst_row; + a = (double *)Astore->nzval; + rowptr = Astore->rowptr; + colind = Astore->colind; + + /* Structures needed for parallel symbolic factorization */ + int_t *sizes, *fstVtxSep; + int noDomains, nprocs_num; + MPI_Comm symb_comm; /* communicator for symbolic factorization */ + int col, key; /* parameters for creating a new communicator */ + Pslu_freeable_t Pslu_freeable; + + sizes = NULL; + fstVtxSep = NULL; + symb_comm = MPI_COMM_NULL; + + Equil = (!factored && options->Equil == YES); + notran = (options->Trans == NOTRANS); + + iam = grid->iam; + job = 5; + /* Extract equilibration status from a previous factorization */ + if (factored || (Fact == SamePattern_SameRowPerm && Equil)) + { + rowequ = (ScalePermstruct->DiagScale == ROW) || + (ScalePermstruct->DiagScale == BOTH); + colequ = (ScalePermstruct->DiagScale == COL) || + (ScalePermstruct->DiagScale == BOTH); + } + else + { + rowequ = colequ = FALSE; + } + + /* Not factored & ask for equilibration, then alloc RC */ + if (Equil && Fact != SamePattern_SameRowPerm) + 
dallocScalePermstruct_RC(ScalePermstruct, m, n); + + /* The following arrays are replicated on all processes. */ + perm_r = ScalePermstruct->perm_r; + perm_c = ScalePermstruct->perm_c; + etree = LUstruct->etree; + R = ScalePermstruct->R; + C = ScalePermstruct->C; + + /* ------------------------------------------------------------ + Diagonal scaling to equilibrate the matrix. + ------------------------------------------------------------ */ + if (Equil) + { + scaleMatrixDiagonally(Fact, ScalePermstruct, + A, stat, grid, &rowequ, &colequ, &iinfo); + if (iinfo < 0) + return; // return if error + + } /* end if Equil ... LAPACK style, not involving MC64 */ + + if (!factored) + { /* Skip this if already factored. */ + /* + * Gather A from the distributed compressed row format to + * global A in compressed column format. + * Numerical values are gathered only when a row permutation + * for large diagonal is sought after. + */ + if (Fact != SamePattern_SameRowPerm && + (parSymbFact == NO || options->RowPerm != NO)) + { + + need_value = (options->RowPerm == LargeDiag_MC64); + + pdCompRow_loc_to_CompCol_global(need_value, A, grid, &GA); + + GAstore = (NCformat *)GA.Store; + colptr = GAstore->colptr; + rowind = GAstore->rowind; + nnz = GAstore->nnz; + GA_mem_use = (nnz + n + 1) * sizeof(int_t); + + if (need_value) + { + a_GA = (double *)GAstore->nzval; + GA_mem_use += nnz * sizeof(double); + } + + else + assert(GAstore->nzval == NULL); + } + + /* ------------------------------------------------------------ + Find the row permutation for A. + ------------------------------------------------------------ */ + perform_row_permutation( + options, Fact, ScalePermstruct, LUstruct, + m, n, grid, A, &GA, stat, job, Equil, + &rowequ, &colequ, &iinfo); + + } /* end if (!factored) */ + + /* Compute norm(A), which will be used to adjust small diagonal. 
*/ + if (!factored || options->IterRefine) + anorm = computeA_Norm(notran, A, grid); + + /* ------------------------------------------------------------ + Perform ordering and symbolic factorization + ------------------------------------------------------------ */ + if (!factored) + { + t = SuperLU_timer_(); + /* + * Get column permutation vector perm_c[], according to permc_spec: + * permc_spec = NATURAL: natural ordering + * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A + * permc_spec = MMD_ATA: minimum degree on structure of A'*A + * permc_spec = METIS_AT_PLUS_A: METIS on structure of A'+A + * permc_spec = PARMETIS: parallel METIS on structure of A'+A + * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] + */ + permc_spec = options->ColPerm; + + if (parSymbFact == YES || permc_spec == PARMETIS) + { + nprocs_num = grid->nprow * grid->npcol; + noDomains = (int)(pow(2, ((int)LOG2(nprocs_num)))); + + /* create a new communicator for the first noDomains + processes in grid->comm */ + key = iam; + if (iam < noDomains) + col = 0; + else + col = MPI_UNDEFINED; + MPI_Comm_split(grid->comm, col, key, &symb_comm); + + if (permc_spec == NATURAL || permc_spec == MY_PERMC) + { + if (permc_spec == NATURAL) + { + for (j = 0; j < n; ++j) + perm_c[j] = j; + } + if (!(sizes = intMalloc_dist(2 * noDomains))) + ABORT("SUPERLU_MALLOC fails for sizes."); + if (!(fstVtxSep = intMalloc_dist(2 * noDomains))) + ABORT("SUPERLU_MALLOC fails for fstVtxSep."); + for (i = 0; i < 2 * noDomains - 2; ++i) + { + sizes[i] = 0; + fstVtxSep[i] = 0; + } + sizes[2 * noDomains - 2] = m; + fstVtxSep[2 * noDomains - 2] = 0; + } + else if (permc_spec != PARMETIS) + { + /* same as before */ + printf("{%4d,%4d}: pdgssvx3d: invalid ColPerm option when ParSymbfact is used\n", + (int)MYROW(grid->iam, grid), (int)MYCOL(grid->iam, grid)); + } + } /* end ... 
use parmetis */ + + + if (permc_spec != MY_PERMC && Fact == DOFACT) + { + if (permc_spec == PARMETIS) + { + /* Get column permutation vector in perm_c. * + * This routine takes as input the distributed input matrix A * + * and does not modify it. It also allocates memory for * + * sizes[] and fstVtxSep[] arrays, that contain information * + * on the separator tree computed by ParMETIS. */ + flinfo = get_perm_c_parmetis(A, perm_r, perm_c, nprocs_num, + noDomains, &sizes, &fstVtxSep, + grid, &symb_comm); + if (flinfo > 0) + ABORT("ERROR in get perm_c parmetis."); + } + else + { + get_perm_c_dist(iam, permc_spec, &GA, perm_c); + } + } + + stat->utime[COLPERM] = SuperLU_timer_() - t; + + /* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc' + (a.k.a. column etree), depending on the choice of ColPerm. + Adjust perm_c[] to be consistent with a postorder of etree. + Permute columns of A to form A*Pc'. */ + if (Fact != SamePattern_SameRowPerm) + { + if (parSymbFact == NO) + { + + int_t *GACcolbeg, *GACcolend, *GACrowind; + + sp_colorder(options, &GA, perm_c, etree, &GAC); + + /* Form Pc*A*Pc' to preserve the diagonal of the matrix GAC. */ + GACstore = (NCPformat *)GAC.Store; + GACcolbeg = GACstore->colbeg; + GACcolend = GACstore->colend; + GACrowind = GACstore->rowind; + for (j = 0; j < n; ++j) + { + for (i = GACcolbeg[j]; i < GACcolend[j]; ++i) + { + irow = GACrowind[i]; + GACrowind[i] = perm_c[irow]; + } + } + + /* Perform a symbolic factorization on Pc*Pr*A*Pc' and set up + the nonzero data structures for L & U. */ +#if (PRNTlevel >= 1) + if (!iam) + printf(".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n", + sp_ienv_dist(2, options), sp_ienv_dist(3, options), sp_ienv_dist(6, options)); +#endif + t = SuperLU_timer_(); + if (!(Glu_freeable = (Glu_freeable_t *) + SUPERLU_MALLOC(sizeof(Glu_freeable_t)))) + ABORT("Malloc fails for Glu_freeable."); + + /* Every process does this. 
*/ + iinfo = symbfact(options, iam, &GAC, perm_c, etree, + Glu_persist, Glu_freeable); + + stat->utime[SYMBFAC] = SuperLU_timer_() - t; + if (iinfo < 0) + { + /* Successful return */ + QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage); + +#if (PRNTlevel >= 1) + if (!iam) + { + printf("\tNo of supers %ld\n", + (long)Glu_persist->supno[n - 1] + 1); + printf("\tSize of G(L) %ld\n", (long)Glu_freeable->xlsub[n]); + printf("\tSize of G(U) %ld\n", (long)Glu_freeable->xusub[n]); + printf("\tint %lu, short %lu, float %lu, double %lu\n", + sizeof(int_t), sizeof(short), + sizeof(float), sizeof(double)); + printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", + symb_mem_usage.for_lu * 1e-6, + symb_mem_usage.total * 1e-6, + symb_mem_usage.expansions); + } +#endif + } + else + { + if (!iam) + { + fprintf(stderr, "symbfact() error returns %d\n", + (int)iinfo); + exit(-1); + } + } + + } /* end serial symbolic factorization */ + else + { /* parallel symbolic factorization */ + t = SuperLU_timer_(); + flinfo = + symbfact_dist(options, nprocs_num, noDomains, + A, perm_c, perm_r, + sizes, fstVtxSep, &Pslu_freeable, + &(grid->comm), &symb_comm, + &symb_mem_usage); + stat->utime[SYMBFAC] = SuperLU_timer_() - t; + if (flinfo > 0) + ABORT("Insufficient memory for parallel symbolic factorization."); + } + + /* Destroy GA */ + if (parSymbFact == NO || options->RowPerm != NO) + Destroy_CompCol_Matrix_dist(&GA); + if (parSymbFact == NO) + Destroy_CompCol_Permuted_dist(&GAC); + + } /* end if Fact not SamePattern_SameRowPerm */ + +#if (DEBUGlevel >= 2) // Sherry + if (!iam) + PrintInt10("perm_c", m, perm_c); +#endif + if (sizes) + SUPERLU_FREE(sizes); + if (fstVtxSep) + SUPERLU_FREE(fstVtxSep); + if (symb_comm != MPI_COMM_NULL) + MPI_Comm_free(&symb_comm); + + if (parSymbFact == NO || Fact == SamePattern_SameRowPerm) + { + /* Apply column permutation to the original distributed A */ + for (j = 0; j < nnz_loc; ++j) + colind[j] = perm_c[colind[j]]; + + /* Distribute 
Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. + NOTE: the row permutation Pc*Pr is applied internally in the + distribution routine. */ + t = SuperLU_timer_(); + + nsupers = getNsupers(n, LUstruct->Glu_persist); + int* supernodeMask; + if(Fact == SamePattern_SameRowPerm){ + supernodeMask=trf3Dpartition->supernodeMask; + dist_mem_use = pddistribute_allgrid(options, n, A, ScalePermstruct, + Glu_freeable, LUstruct, grid, supernodeMask); + + }else{ + + // First call of pddistribute_allgrid with a prefixed supernodeMask + // YL: this first call can be removed with Piyush's cleaner fix + int* supernodeMask = int32Calloc_dist(nsupers); + for (int i=0;izscp.Iam == i%grid3d->npdep) + supernodeMask[i]=1; + } + dist_mem_use = pddistribute_allgrid_index_only(options, n, A, ScalePermstruct, + Glu_freeable, LUstruct, grid, supernodeMask); + SUPERLU_FREE(supernodeMask); + + // Generate the 3D partition + dDestroy_trf3Dpartition(LUstruct->trf3Dpart); + trf3Dpartition = dinitTrf3Dpartition_allgrid(n, options, LUstruct, grid3d); + LUstruct->trf3Dpart=trf3Dpartition; + + // Delete the meta data generated by pddistribute_allgrid + dLocalLU_t *Llu = LUstruct->Llu; + for (int jb = 0; jb < CEILING( nsupers, grid->npcol ); ++jb) { /* for each block column ... */ + if ( Llu->Lrowind_bc_ptr[jb] ) { + SUPERLU_FREE (Llu->Lrowind_bc_ptr[jb]); + } + } + SUPERLU_FREE (Llu->Lrowind_bc_ptr); + for (int lb = 0; lb < CEILING( nsupers, grid->nprow ); ++lb) { /* for each block row ... 
*/ + if(Llu->Ufstnz_br_ptr[lb]!=NULL) + SUPERLU_FREE(Llu->Ufstnz_br_ptr[lb]); + } + SUPERLU_FREE(Llu->Ufstnz_br_ptr); + + + // Second call of pddistribute_allgrid with the final supernodeMask + dist_mem_use = pddistribute_allgrid(options, n, A, ScalePermstruct, + Glu_freeable, LUstruct, grid, trf3Dpartition->supernodeMask); + + + /* now that LU structure has been scattered, initialize the LU and buffers */ + dinit3DLUstructForest(trf3Dpartition->myTreeIdxs, trf3Dpartition->myZeroTrIdxs, + trf3Dpartition->sForests, LUstruct, grid3d); + dLUValSubBuf_t *LUvsb = SUPERLU_MALLOC(sizeof(dLUValSubBuf_t)); + dLluBufInit(LUvsb, LUstruct); + trf3Dpartition->LUvsb = LUvsb; + } + + + stat->utime[DIST] = SuperLU_timer_() - t; + + /* Deallocate storage used in symbolic factorization. */ + if (Fact != SamePattern_SameRowPerm) + { + iinfo = symbfact_SubFree(Glu_freeable); + SUPERLU_FREE(Glu_freeable); + } + + } + else + { + /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. + NOTE: the row permutation Pc*Pr is applied internally in the + distribution routine. */ + /* Apply column permutation to the original distributed A */ + for (j = 0; j < nnz_loc; ++j) + colind[j] = perm_c[colind[j]]; + + t = SuperLU_timer_(); + dist_mem_use = ddist_psymbtonum(options, n, A, ScalePermstruct, + &Pslu_freeable, LUstruct, grid); + if (dist_mem_use > 0) + ABORT("Not enough memory available for dist_psymbtonum\n"); + + stat->utime[DIST] = SuperLU_timer_() - t; + + ABORT("ddist_psymbtonum does not yet work with 3D factorization\n"); + + } + + /*if (!iam) printf ("\tDISTRIBUTE time %8.2f\n", stat->utime[DIST]); */ + + + /* Perform numerical factorization in parallel on all process layers.*/ + + /* nvshmem related. 
The nvshmem_malloc has to be called before trs_compute_communication_structure, otherwise solve is much slower*/ + #ifdef HAVE_NVSHMEM + int nc = CEILING( nsupers, grid->npcol); + int nr = CEILING( nsupers, grid->nprow); + int flag_bc_size = RDMA_FLAG_SIZE * (nc+1); + int flag_rd_size = RDMA_FLAG_SIZE * nr * 2; + int my_flag_bc_size = RDMA_FLAG_SIZE * (nc+1); + int my_flag_rd_size = RDMA_FLAG_SIZE * nr * 2; + int maxrecvsz = sp_ienv_dist(3, options)* nrhs + SUPERLU_MAX( XK_H, LSUM_H ); + int ready_x_size = maxrecvsz*nc; + int ready_lsum_size = 2*maxrecvsz*nr; + if (getenv("SUPERLU_ACC_SOLVE")){ + nv_init_wrapper(grid->comm); + prepare_multiGPU_buffers(flag_bc_size,flag_rd_size,ready_x_size,ready_lsum_size,my_flag_bc_size,my_flag_rd_size); + } + #endif + + + + + + SCT_t *SCT = (SCT_t *)SUPERLU_MALLOC(sizeof(SCT_t)); + SCT_init(SCT); + +#if (PRNTlevel >= 1) + if (grid3d->iam == 0) + { + printf("after 3D initialization.\n"); + fflush(stdout); + } +#endif + + + + + + t = SuperLU_timer_(); + + /*factorize in grid 1*/ + // if(grid3d->zscp.Iam) + // get environment variable TRF3DVERSION +#ifdef GPU_ACC + if (gpu3dVersion == 1) + { /* this is the new C++ code in TRF3dV100/ directory */ + + if (!grid3d->iam) + printf("Using pdgstrf3d+gpu version 1 for Summit\n"); +#if 0 + pdgstrf3d_summit(options, m, n, anorm, trf3Dpartition, SCT, LUstruct, + grid3d, stat, info); +#else + int_t ldt = sp_ienv_dist(3, options); /* Size of maximum supernode */ + double s_eps = smach_dist("Epsilon"); + double thresh = s_eps * anorm; + + /* call constructor in C++ code */ + LUgpu = createLUgpuHandle(nsupers, ldt, trf3Dpartition, LUstruct, grid3d, + SCT, options, stat, thresh, info); + + /* call pdgstrf3d() in C++ code */ + pdgstrf3d_LUpackedInterface(LUgpu); + + copyLUGPU2Host(LUgpu, LUstruct); + destroyLUgpuHandle(LUgpu); + + // print other stuff + // if (!grid3d->zscp.Iam) + // SCT_printSummary(grid, SCT); + reduceStat(FACT, stat, grid3d); + +#endif + } + else /* this is the old C code, with 
less GPU offload */ +#endif /* matching ifdef GPU_ACC */ + { + + pdgstrf3d(options, m, n, anorm, trf3Dpartition, SCT, LUstruct, + grid3d, stat, info); + + // dDumpLblocks3D(nsupers, grid3d, LUstruct->Glu_persist, LUstruct->Llu); + + + } + if (getenv("NEW3DSOLVE")){ + dbroadcastAncestor3d(trf3Dpartition, LUstruct, grid3d, SCT); + } + + if ( options->Fact != SamePattern_SameRowPerm) { + if (getenv("NEW3DSOLVE") && Solve3D==true){ + trs_compute_communication_structure(options, n, LUstruct, + ScalePermstruct, trf3Dpartition->supernodeMask, grid, stat); + }else{ + int* supernodeMask = int32Malloc_dist(nsupers); + for(int ii=0; iiutime[FACT] = SuperLU_timer_() - t; + + /*factorize in grid 1*/ + // if(grid3d->zscp.Iam) + double tgather = SuperLU_timer_(); + if(Solve3D==false){ + dgatherAllFactoredLU(trf3Dpartition, LUstruct, grid3d, SCT); + } + SCT->gatherLUtimer += SuperLU_timer_() - tgather; + /*print stats for bottom grid*/ + + // Write LU to file + int writeLU = 0; + if (getenv("WRITELU")) + { + writeLU = atoi(getenv("WRITELU")); + } + + if (writeLU) + { + if (!grid3d->zscp.Iam) + writeLUtoDisk(nsupers, Glu_persist->xsup, LUstruct); + } + + int checkLU = 0; + if (getenv("CHECKLU")) + { + checkLU = atoi(getenv("CHECKLU")); + } + + if (checkLU) + { + if (!grid3d->zscp.Iam) + checkLUFromDisk(nsupers, Glu_persist->xsup, LUstruct); + } + +#if (PRNTlevel >= 0) + if (!grid3d->zscp.Iam) + { + SCT_print(grid, SCT); + SCT_print3D(grid3d, SCT); + } + SCT_printComm3D(grid3d, SCT); + + /*print memory usage*/ + d3D_printMemUse(trf3Dpartition, LUstruct, grid3d); + + SCT->gatherLUtimer += SuperLU_timer_() - tgather; + /*print stats for bottom grid*/ + /*print forest weight and costs*/ + printForestWeightCost(trf3Dpartition->sForests, SCT, grid3d); + /*reduces stat from all the layers*/ +#endif + + SCT_free(SCT); + + } /* end if not Factored ... 
factor on all process layers */ + + if (grid3d->zscp.Iam == 0 ) + { // only process layer 0 + if (!factored) + { + if (options->PrintStat) + { + int_t TinyPivots; + float for_lu, total, avg, loc_max; + float mem_stage[3]; + struct { float val; int rank; } local_struct, global_struct; + + MPI_Reduce( &stat->TinyPivots, &TinyPivots, 1, mpi_int_t, + MPI_SUM, 0, grid->comm ); + stat->TinyPivots = TinyPivots; + + /*-- Compute high watermark of all stages --*/ + if (parSymbFact == TRUE) + { + /* The memory used in the redistribution routine + includes the memory used for storing the symbolic + structure and the memory allocated for numerical + factorization */ + mem_stage[0] = (-flinfo); /* symbfact step */ + mem_stage[1] = (-dist_mem_use); /* distribution step */ + loc_max = SUPERLU_MAX( mem_stage[0], mem_stage[1]); + if (options->RowPerm != NO ) + loc_max = SUPERLU_MAX(loc_max, GA_mem_use); + } + else + { + mem_stage[0] = symb_mem_usage.total + GA_mem_use; /* symbfact step */ + mem_stage[1] = symb_mem_usage.for_lu + dist_mem_use + num_mem_usage.for_lu; /* distribution step */ + loc_max = SUPERLU_MAX(mem_stage[0], mem_stage[1] ); + } + + dQuerySpace_dist(n, LUstruct, grid, stat, &num_mem_usage); + mem_stage[2] = num_mem_usage.total; /* numerical factorization step */ + + loc_max = SUPERLU_MAX(loc_max, mem_stage[2] ); /* local max of 3 stages */ + + local_struct.val = loc_max; + local_struct.rank = grid->iam; + MPI_Reduce( &local_struct, &global_struct, 1, MPI_FLOAT_INT, MPI_MAXLOC, 0, grid->comm ); + int all_highmark_rank = global_struct.rank; + float all_highmark_mem = global_struct.val * 1e-6; + + MPI_Reduce( &loc_max, &avg, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); + MPI_Reduce( &num_mem_usage.for_lu, &for_lu, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); + MPI_Reduce( &num_mem_usage.total, &total, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); + + /*-- Compute memory usage of numerical factorization --*/ + local_struct.val = num_mem_usage.for_lu; + MPI_Reduce(&local_struct, 
&global_struct, 1, MPI_FLOAT_INT, MPI_MAXLOC, 0, grid->comm); + int lu_max_rank = global_struct.rank; + float lu_max_mem = global_struct.val * 1e-6; + + local_struct.val = stat->peak_buffer; + MPI_Reduce( &local_struct, &global_struct, 1, MPI_FLOAT_INT, MPI_MAXLOC, 0, grid->comm ); + int buffer_peak_rank = global_struct.rank; + float buffer_peak = global_struct.val*1e-6; + if (iam == 0) + { + printf("\n** Memory Usage **********************************\n"); + printf("** Total highmark (MB):\n" + " Sum-of-all : %8.2f | Avg : %8.2f | Max : %8.2f\n", + avg * 1e-6, + avg / grid->nprow / grid->npcol * 1e-6, + all_highmark_mem); + printf(" Max at rank %d, different stages (MB):\n" + "\t. symbfact %8.2f\n" + "\t. distribution %8.2f\n" + "\t. numfact %8.2f\n", + all_highmark_rank, mem_stage[0] * 1e-6, mem_stage[1] * 1e-6, mem_stage[2] * 1e-6); + printf("** NUMfact space (MB): (sum-of-all-processes)\n" + " L\\U : %8.2f | Total : %8.2f\n", + for_lu * 1e-6, total * 1e-6); + printf("\t. max at rank %d, max L+U memory (MB): %8.2f\n" + "\t. max at rank %d, peak buffer (MB): %8.2f\n", + lu_max_rank, lu_max_mem, + buffer_peak_rank, buffer_peak); + printf("**************************************************\n\n"); + printf("** number of Tiny Pivots: %8d\n\n", stat->TinyPivots); + fflush(stdout); + } + } /* end printing stats */ + + } /* end if not Factored */ + } + + if(Solve3D){ + + if ( options->Fact == DOFACT || options->Fact == SamePattern ) { + /* Need to reset the solve's communication pattern, + because perm_r[] and/or perm_c[] is changed. 
*/ + if ( options->SolveInitialized == YES ) { /* Initialized before */ + dSolveFinalize(options, SOLVEstruct); /* Clean up structure */ + pdgstrs_delete_device_lsum_x(SOLVEstruct); + options->SolveInitialized = NO; /* Reset the solve state */ + } + } + + if (getenv("NEW3DSOLVE")){ + + + if (options->DiagInv == YES && (Fact != FACTORED)) + { + pdCompute_Diag_Inv(n, LUstruct, grid, stat, info); + + // The following #ifdef GPU_ACC block frees and reallocates GPU data for trisolve. The data seems to be overwritten by pdgstrf3d. + int_t nsupers = getNsupers(n, LUstruct->Glu_persist); +#if (defined(GPU_ACC) && defined(GPU_SOLVE)) + + pdconvertU(options, grid, LUstruct, stat, n); + + // checkGPU(gpuFree(LUstruct->Llu->d_xsup)); + // checkGPU(gpuFree(LUstruct->Llu->d_bcols_masked)); + // checkGPU(gpuFree(LUstruct->Llu->d_LRtree_ptr)); + // checkGPU(gpuFree(LUstruct->Llu->d_LBtree_ptr)); + // checkGPU(gpuFree(LUstruct->Llu->d_URtree_ptr)); + // checkGPU(gpuFree(LUstruct->Llu->d_UBtree_ptr)); + // checkGPU(gpuFree(LUstruct->Llu->d_Lrowind_bc_dat)); + // checkGPU(gpuFree(LUstruct->Llu->d_Lindval_loc_bc_dat)); + // checkGPU(gpuFree(LUstruct->Llu->d_Lrowind_bc_offset)); + // checkGPU(gpuFree(LUstruct->Llu->d_Lindval_loc_bc_offset)); + // checkGPU(gpuFree(LUstruct->Llu->d_Lnzval_bc_offset)); + // checkGPU(gpuFree(LUstruct->Llu->d_Linv_bc_offset)); + // checkGPU(gpuFree(LUstruct->Llu->d_Uinv_bc_offset)); + // checkGPU(gpuFree(LUstruct->Llu->d_ilsum)); + // checkGPU(gpuFree(LUstruct->Llu->d_grid)); + // checkGPU(gpuFree(LUstruct->Llu->d_Lnzval_bc_dat)); + // checkGPU(gpuFree(LUstruct->Llu->d_Linv_bc_dat)); + // checkGPU(gpuFree(LUstruct->Llu->d_Uinv_bc_dat)); + + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_xsup, (n + 1) * sizeof(int_t))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_xsup, LUstruct->Glu_persist->xsup, (n + 1) * sizeof(int_t), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc( (void**)&LUstruct->Llu->d_bcols_masked, LUstruct->Llu->nbcol_masked * sizeof(int))); + // 
checkGPU(gpuMemcpy(LUstruct->Llu->d_bcols_masked, LUstruct->Llu->bcols_masked, LUstruct->Llu->nbcol_masked * sizeof(int), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_LRtree_ptr, CEILING(nsupers, grid->nprow) * sizeof(C_Tree))); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_LBtree_ptr, CEILING(nsupers, grid->npcol) * sizeof(C_Tree))); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_URtree_ptr, CEILING(nsupers, grid->nprow) * sizeof(C_Tree))); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_UBtree_ptr, CEILING(nsupers, grid->npcol) * sizeof(C_Tree))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_LRtree_ptr, LUstruct->Llu->LRtree_ptr, CEILING(nsupers, grid->nprow) * sizeof(C_Tree), gpuMemcpyHostToDevice)); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_LBtree_ptr, LUstruct->Llu->LBtree_ptr, CEILING(nsupers, grid->npcol) * sizeof(C_Tree), gpuMemcpyHostToDevice)); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_URtree_ptr, LUstruct->Llu->URtree_ptr, CEILING(nsupers, grid->nprow) * sizeof(C_Tree), gpuMemcpyHostToDevice)); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_UBtree_ptr, LUstruct->Llu->UBtree_ptr, CEILING(nsupers, grid->npcol) * sizeof(C_Tree), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lrowind_bc_dat, (LUstruct->Llu->Lrowind_bc_cnt) * sizeof(int_t))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_Lrowind_bc_dat, LUstruct->Llu->Lrowind_bc_dat, (LUstruct->Llu->Lrowind_bc_cnt) * sizeof(int_t), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lindval_loc_bc_dat, (LUstruct->Llu->Lindval_loc_bc_cnt) * sizeof(int_t))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_Lindval_loc_bc_dat, LUstruct->Llu->Lindval_loc_bc_dat, (LUstruct->Llu->Lindval_loc_bc_cnt) * sizeof(int_t), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lrowind_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_Lrowind_bc_offset, 
LUstruct->Llu->Lrowind_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lindval_loc_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_Lindval_loc_bc_offset, LUstruct->Llu->Lindval_loc_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lnzval_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_Lnzval_bc_offset, LUstruct->Llu->Lnzval_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Linv_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_Linv_bc_offset, LUstruct->Llu->Linv_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Uinv_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_Uinv_bc_offset, LUstruct->Llu->Uinv_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_ilsum, (CEILING(nsupers, grid->nprow) + 1) * sizeof(int_t))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_ilsum, LUstruct->Llu->ilsum, (CEILING(nsupers, grid->nprow) + 1) * sizeof(int_t), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lnzval_bc_dat, (LUstruct->Llu->Lnzval_bc_cnt) * sizeof(double))); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Linv_bc_dat, (LUstruct->Llu->Linv_bc_cnt) * sizeof(double))); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Uinv_bc_dat, (LUstruct->Llu->Uinv_bc_cnt) * sizeof(double))); + // checkGPU(gpuMalloc( (void**)&LUstruct->Llu->d_grid, sizeof(gridinfo_t))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_grid, grid, 
sizeof(gridinfo_t), gpuMemcpyHostToDevice)); +#endif +if (getenv("SUPERLU_ACC_SOLVE")){ +#ifdef GPU_ACC + checkGPU(gpuMemcpy(LUstruct->Llu->d_Linv_bc_dat, LUstruct->Llu->Linv_bc_dat, + (LUstruct->Llu->Linv_bc_cnt) * sizeof(double), gpuMemcpyHostToDevice)); + checkGPU(gpuMemcpy(LUstruct->Llu->d_Uinv_bc_dat, LUstruct->Llu->Uinv_bc_dat, + (LUstruct->Llu->Uinv_bc_cnt) * sizeof(double), gpuMemcpyHostToDevice)); + checkGPU(gpuMemcpy(LUstruct->Llu->d_Lnzval_bc_dat, LUstruct->Llu->Lnzval_bc_dat, + (LUstruct->Llu->Lnzval_bc_cnt) * sizeof(double), gpuMemcpyHostToDevice)); +#endif +} + } + } + }else{ /* if(Solve3D) */ + + if (grid3d->zscp.Iam == 0){ /* on 2D grid-0 */ + + if ( options->Fact == DOFACT || options->Fact == SamePattern ) { + /* Need to reset the solve's communication pattern, + because perm_r[] and/or perm_c[] is changed. */ + if ( options->SolveInitialized == YES ) { /* Initialized before */ + dSolveFinalize(options, SOLVEstruct); /* Clean up structure */ + pdgstrs_delete_device_lsum_x(SOLVEstruct); + options->SolveInitialized = NO; /* Reset the solve state */ + } + } + +#if (defined(GPU_ACC) && defined(GPU_SOLVE)) + if (options->DiagInv == NO) + { + if (iam == 0) + { + printf("!!WARNING: GPU trisolve requires setting options->DiagInv==YES\n"); + printf(" otherwise, use CPU trisolve\n"); + fflush(stdout); + } + // exit(0); // Sherry: need to return an error flag + } +#endif + + if (options->DiagInv == YES && (Fact != FACTORED)) + { + pdCompute_Diag_Inv(n, LUstruct, grid, stat, info); + + // The following #ifdef GPU_ACC block frees and reallocates GPU data for trisolve. The data seems to be overwritten by pdgstrf3d. 
+ int_t nsupers = getNsupers(n, LUstruct->Glu_persist); +#ifdef GPU_ACC + + pdconvertU(options, grid, LUstruct, stat, n); + + // checkGPU(gpuFree(LUstruct->Llu->d_xsup)); + // checkGPU(gpuFree(LUstruct->Llu->d_bcols_masked)); + // checkGPU(gpuFree(LUstruct->Llu->d_LRtree_ptr)); + // checkGPU(gpuFree(LUstruct->Llu->d_LBtree_ptr)); + // checkGPU(gpuFree(LUstruct->Llu->d_URtree_ptr)); + // checkGPU(gpuFree(LUstruct->Llu->d_UBtree_ptr)); + // checkGPU(gpuFree(LUstruct->Llu->d_Lrowind_bc_dat)); + // checkGPU(gpuFree(LUstruct->Llu->d_Lindval_loc_bc_dat)); + // checkGPU(gpuFree(LUstruct->Llu->d_Lrowind_bc_offset)); + // checkGPU(gpuFree(LUstruct->Llu->d_Lindval_loc_bc_offset)); + // checkGPU(gpuFree(LUstruct->Llu->d_Lnzval_bc_offset)); + // checkGPU(gpuFree(LUstruct->Llu->d_Linv_bc_offset)); + // checkGPU(gpuFree(LUstruct->Llu->d_Uinv_bc_offset)); + // checkGPU(gpuFree(LUstruct->Llu->d_ilsum)); + // checkGPU(gpuFree(LUstruct->Llu->d_grid)); + // checkGPU(gpuFree(LUstruct->Llu->d_Lnzval_bc_dat)); + // checkGPU(gpuFree(LUstruct->Llu->d_Linv_bc_dat)); + // checkGPU(gpuFree(LUstruct->Llu->d_Uinv_bc_dat)); + + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_xsup, (n + 1) * sizeof(int_t))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_xsup, LUstruct->Glu_persist->xsup, (n + 1) * sizeof(int_t), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc( (void**)&LUstruct->Llu->d_bcols_masked, LUstruct->Llu->nbcol_masked * sizeof(int))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_bcols_masked, LUstruct->Llu->bcols_masked, LUstruct->Llu->nbcol_masked * sizeof(int), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_LRtree_ptr, CEILING(nsupers, grid->nprow) * sizeof(C_Tree))); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_LBtree_ptr, CEILING(nsupers, grid->npcol) * sizeof(C_Tree))); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_URtree_ptr, CEILING(nsupers, grid->nprow) * sizeof(C_Tree))); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_UBtree_ptr, 
CEILING(nsupers, grid->npcol) * sizeof(C_Tree))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_LRtree_ptr, LUstruct->Llu->LRtree_ptr, CEILING(nsupers, grid->nprow) * sizeof(C_Tree), gpuMemcpyHostToDevice)); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_LBtree_ptr, LUstruct->Llu->LBtree_ptr, CEILING(nsupers, grid->npcol) * sizeof(C_Tree), gpuMemcpyHostToDevice)); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_URtree_ptr, LUstruct->Llu->URtree_ptr, CEILING(nsupers, grid->nprow) * sizeof(C_Tree), gpuMemcpyHostToDevice)); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_UBtree_ptr, LUstruct->Llu->UBtree_ptr, CEILING(nsupers, grid->npcol) * sizeof(C_Tree), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lrowind_bc_dat, (LUstruct->Llu->Lrowind_bc_cnt) * sizeof(int_t))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_Lrowind_bc_dat, LUstruct->Llu->Lrowind_bc_dat, (LUstruct->Llu->Lrowind_bc_cnt) * sizeof(int_t), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lindval_loc_bc_dat, (LUstruct->Llu->Lindval_loc_bc_cnt) * sizeof(int_t))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_Lindval_loc_bc_dat, LUstruct->Llu->Lindval_loc_bc_dat, (LUstruct->Llu->Lindval_loc_bc_cnt) * sizeof(int_t), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lrowind_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_Lrowind_bc_offset, LUstruct->Llu->Lrowind_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lindval_loc_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_Lindval_loc_bc_offset, LUstruct->Llu->Lindval_loc_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lnzval_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int))); + // 
checkGPU(gpuMemcpy(LUstruct->Llu->d_Lnzval_bc_offset, LUstruct->Llu->Lnzval_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Linv_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_Linv_bc_offset, LUstruct->Llu->Linv_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Uinv_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_Uinv_bc_offset, LUstruct->Llu->Uinv_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_ilsum, (CEILING(nsupers, grid->nprow) + 1) * sizeof(int_t))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_ilsum, LUstruct->Llu->ilsum, (CEILING(nsupers, grid->nprow) + 1) * sizeof(int_t), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lnzval_bc_dat, (LUstruct->Llu->Lnzval_bc_cnt) * sizeof(double))); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Linv_bc_dat, (LUstruct->Llu->Linv_bc_cnt) * sizeof(double))); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Uinv_bc_dat, (LUstruct->Llu->Uinv_bc_cnt) * sizeof(double))); + // checkGPU(gpuMalloc( (void**)&LUstruct->Llu->d_grid, sizeof(gridinfo_t))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_grid, grid, sizeof(gridinfo_t), gpuMemcpyHostToDevice)); +#endif + +if (getenv("SUPERLU_ACC_SOLVE")){ +#ifdef GPU_ACC + + checkGPU(gpuMemcpy(LUstruct->Llu->d_Linv_bc_dat, LUstruct->Llu->Linv_bc_dat, + (LUstruct->Llu->Linv_bc_cnt) * sizeof(double), gpuMemcpyHostToDevice)); + checkGPU(gpuMemcpy(LUstruct->Llu->d_Uinv_bc_dat, LUstruct->Llu->Uinv_bc_dat, + (LUstruct->Llu->Uinv_bc_cnt) * sizeof(double), gpuMemcpyHostToDevice)); + checkGPU(gpuMemcpy(LUstruct->Llu->d_Lnzval_bc_dat, LUstruct->Llu->Lnzval_bc_dat, + (LUstruct->Llu->Lnzval_bc_cnt) * 
sizeof(double), gpuMemcpyHostToDevice)); +#endif +} + } + } + } + + + /* ------------------------------------------------------------ + Compute the solution matrix X. + ------------------------------------------------------------ */ + if ((nrhs > 0) && (*info == 0)) + { + if (options->SolveInitialized == NO){ + if (getenv("SUPERLU_ACC_SOLVE")){ + if (getenv("NEW3DSOLVE") && Solve3D==true){ + pdgstrs_init_device_lsum_x(options, n, m_loc, nrhs, grid,LUstruct, SOLVEstruct,trf3Dpartition->supernodeMask); + }else{ + int* supernodeMask = int32Malloc_dist(nsupers); + for(int ii=0; iiutime[SOLVE] = 0.0; + if(Solve3D){ + + // if (!(b_work = doubleMalloc_dist(n))) + // ABORT("Malloc fails for b_work[]"); + /* ------------------------------------------------------ + Scale the right-hand side if equilibration was performed + ------------------------------------------------------*/ + if (notran) + { + if (rowequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= R[irow]; + ++irow; + } + b_col += ldb; + } + } + } + else if (colequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= C[irow]; + ++irow; + } + b_col += ldb; + } + } + + /* Save a copy of the right-hand side. */ + ldx = ldb; + if (!(X = doubleMalloc_dist(((size_t)ldx) * nrhs))) + ABORT("Malloc fails for X[]"); + x_col = X; + b_col = B; + for (j = 0; j < nrhs; ++j) + { + for (i = 0; i < m_loc; ++i) + x_col[i] = b_col[i]; + x_col += ldx; + b_col += ldb; + } + + /* ------------------------------------------------------ + Solve the linear system. + ------------------------------------------------------*/ + + if (options->SolveInitialized == NO) + /* First time */ + /* Inside this routine, SolveInitialized is set to YES. 
+ For repeated call to pdgssvx3d(), no need to re-initialilze + the Solve data & communication structures, unless a new + factorization with Fact == DOFACT or SamePattern is asked for. */ + { + dSolveInit(options, A, perm_r, perm_c, nrhs, LUstruct, + grid, SOLVEstruct); + } + if (getenv("NEW3DSOLVE")){ + pdgstrs3d_newsolve (options, n, LUstruct,ScalePermstruct, trf3Dpartition, grid3d, X, + m_loc, fst_row, ldb, nrhs,SOLVEstruct, stat, info); + }else{ + pdgstrs3d (options, n, LUstruct,ScalePermstruct, trf3Dpartition, grid3d, X, + m_loc, fst_row, ldb, nrhs,SOLVEstruct, stat, info); + } + if (options->IterRefine) + { + /* Improve the solution by iterative refinement. */ + int_t *it, *colind_gsmv = SOLVEstruct->A_colind_gsmv; + dSOLVEstruct_t *SOLVEstruct1; /* Used by refinement */ + + t = SuperLU_timer_ (); + if (options->RefineInitialized == NO || Fact == DOFACT) { + /* All these cases need to re-initialize gsmv structure */ + if (options->RefineInitialized) + pdgsmv_finalize (SOLVEstruct->gsmv_comm); + pdgsmv_init (A, SOLVEstruct->row_to_proc, grid, + SOLVEstruct->gsmv_comm); + + /* Save a copy of the transformed local col indices + in colind_gsmv[]. 
*/ + if (colind_gsmv) SUPERLU_FREE (colind_gsmv); + if (!(it = intMalloc_dist (nnz_loc))) + ABORT ("Malloc fails for colind_gsmv[]"); + colind_gsmv = SOLVEstruct->A_colind_gsmv = it; + for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i]; + options->RefineInitialized = YES; + } + else if (Fact == SamePattern || Fact == SamePattern_SameRowPerm) { + double at; + int_t k, jcol, p; + /* Swap to beginning the part of A corresponding to the + local part of X, as was done in pdgsmv_init() */ + for (i = 0; i < m_loc; ++i) { /* Loop through each row */ + k = rowptr[i]; + for (j = rowptr[i]; j < rowptr[i + 1]; ++j) + { + jcol = colind[j]; + p = SOLVEstruct->row_to_proc[jcol]; + if (p == iam) + { /* Local */ + at = a[k]; + a[k] = a[j]; + a[j] = at; + ++k; + } + } + } + + /* Re-use the local col indices of A obtained from the + previous call to pdgsmv_init() */ + for (i = 0; i < nnz_loc; ++i) + colind[i] = colind_gsmv[i]; + } + + if (nrhs == 1) + { /* Use the existing solve structure */ + SOLVEstruct1 = SOLVEstruct; + } + else { + /* For nrhs > 1, since refinement is performed for RHS + one at a time, the communication structure for pdgstrs + is different than the solve with nrhs RHS. + So we use SOLVEstruct1 for the refinement step. + */ + if (!(SOLVEstruct1 = (dSOLVEstruct_t *) + SUPERLU_MALLOC(sizeof(dSOLVEstruct_t)))) + ABORT ("Malloc fails for SOLVEstruct1"); + /* Copy the same stuff */ + SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc; + SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c; + SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs; + SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs; + SOLVEstruct1->diag_len = SOLVEstruct->diag_len; + SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm; + SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv; + + /* Initialize the *gstrs_comm for 1 RHS. 
*/ + if (!(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *) + SUPERLU_MALLOC (sizeof (pxgstrs_comm_t)))) + ABORT ("Malloc fails for gstrs_comm[]"); + pdgstrs_init (n, m_loc, 1, fst_row, perm_r, perm_c, grid, + Glu_persist, SOLVEstruct1); + if (getenv("SUPERLU_ACC_SOLVE")){ + int_t nsupers = getNsupers(n, LUstruct->Glu_persist); + pdgstrs_init_device_lsum_x(options, n, m_loc, 1, grid,LUstruct, SOLVEstruct1,trf3Dpartition->supernodeMask); + } + } + + pdgsrfs3d (options, n, A, anorm, LUstruct, ScalePermstruct, grid3d, trf3Dpartition, + B, ldb, X, ldx, nrhs, SOLVEstruct1, berr, stat, info); + + /* Deallocate the storage associated with SOLVEstruct1 */ + if (nrhs > 1) + { + pdgstrs_delete_device_lsum_x(SOLVEstruct1); + pxgstrs_finalize (SOLVEstruct1->gstrs_comm); + SUPERLU_FREE (SOLVEstruct1); + } + + stat->utime[REFINE] = SuperLU_timer_ () - t; + } /* end IterRefine */ + }else{ + + if (grid3d->zscp.Iam == 0){ /* on 2D grid-0 */ + + /* ------------------------------------------------------ + Scale the right-hand side if equilibration was performed + ------------------------------------------------------*/ + if (notran) + { + if (rowequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= R[irow]; + ++irow; + } + b_col += ldb; + } + } + } + else if (colequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= C[irow]; + ++irow; + } + b_col += ldb; + } + } + + /* Save a copy of the right-hand side. */ + ldx = ldb; + if (!(X = doubleMalloc_dist(((size_t)ldx) * nrhs))) + ABORT("Malloc fails for X[]"); + x_col = X; + b_col = B; + for (j = 0; j < nrhs; ++j) + { + for (i = 0; i < m_loc; ++i) + x_col[i] = b_col[i]; + x_col += ldx; + b_col += ldb; + } + + /* ------------------------------------------------------ + Solve the linear system. 
+ ------------------------------------------------------*/ + if (options->SolveInitialized == NO) + /* First time */ + /* Inside this routine, SolveInitialized is set to YES. + For repeated call to pdgssvx3d(), no need to re-initialize + the Solve data & communication structures, unless a new + factorization with Fact == DOFACT or SamePattern is asked for. */ + { + dSolveInit(options, A, perm_r, perm_c, nrhs, LUstruct, + grid, SOLVEstruct); + } + pdgstrs(options, n, LUstruct, ScalePermstruct, grid, X, m_loc, + fst_row, ldb, nrhs, SOLVEstruct, stat, info); + + /* ------------------------------------------------------------ + Use iterative refinement to improve the computed solution and + compute error bounds and backward error estimates for it. + ------------------------------------------------------------ */ + if (options->IterRefine) + { + /* Improve the solution by iterative refinement. */ + int_t *it, *colind_gsmv = SOLVEstruct->A_colind_gsmv; + dSOLVEstruct_t *SOLVEstruct1; /* Used by refinement */ + + t = SuperLU_timer_ (); + if (options->RefineInitialized == NO || Fact == DOFACT) { + /* All these cases need to re-initialize gsmv structure */ + if (options->RefineInitialized) + pdgsmv_finalize (SOLVEstruct->gsmv_comm); + pdgsmv_init (A, SOLVEstruct->row_to_proc, grid, + SOLVEstruct->gsmv_comm); + + /* Save a copy of the transformed local col indices + in colind_gsmv[]. 
*/ + if (colind_gsmv) SUPERLU_FREE (colind_gsmv); + if (!(it = intMalloc_dist (nnz_loc))) + ABORT ("Malloc fails for colind_gsmv[]"); + colind_gsmv = SOLVEstruct->A_colind_gsmv = it; + for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i]; + options->RefineInitialized = YES; + } + else if (Fact == SamePattern || Fact == SamePattern_SameRowPerm) { + double at; + int_t k, jcol, p; + /* Swap to beginning the part of A corresponding to the + local part of X, as was done in pdgsmv_init() */ + for (i = 0; i < m_loc; ++i) { /* Loop through each row */ + k = rowptr[i]; + for (j = rowptr[i]; j < rowptr[i + 1]; ++j) + { + jcol = colind[j]; + p = SOLVEstruct->row_to_proc[jcol]; + if (p == iam) + { /* Local */ + at = a[k]; + a[k] = a[j]; + a[j] = at; + ++k; + } + } + } + + /* Re-use the local col indices of A obtained from the + previous call to pdgsmv_init() */ + for (i = 0; i < nnz_loc; ++i) + colind[i] = colind_gsmv[i]; + } + + if (nrhs == 1) + { /* Use the existing solve structure */ + SOLVEstruct1 = SOLVEstruct; + } + else { + /* For nrhs > 1, since refinement is performed for RHS + one at a time, the communication structure for pdgstrs + is different than the solve with nrhs RHS. + So we use SOLVEstruct1 for the refinement step. + */ + if (!(SOLVEstruct1 = (dSOLVEstruct_t *) + SUPERLU_MALLOC(sizeof(dSOLVEstruct_t)))) + ABORT ("Malloc fails for SOLVEstruct1"); + /* Copy the same stuff */ + SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc; + SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c; + SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs; + SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs; + SOLVEstruct1->diag_len = SOLVEstruct->diag_len; + SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm; + SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv; + + /* Initialize the *gstrs_comm for 1 RHS. 
*/ + if (!(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *) + SUPERLU_MALLOC (sizeof (pxgstrs_comm_t)))) + ABORT ("Malloc fails for gstrs_comm[]"); + pdgstrs_init (n, m_loc, 1, fst_row, perm_r, perm_c, grid, + Glu_persist, SOLVEstruct1); + if (getenv("SUPERLU_ACC_SOLVE")){ + int_t nsupers = getNsupers(n, LUstruct->Glu_persist); + int* supernodeMask = int32Malloc_dist(nsupers); + for(int ii=0; ii 1) + { + pdgstrs_delete_device_lsum_x(SOLVEstruct1); + pxgstrs_finalize (SOLVEstruct1->gstrs_comm); + SUPERLU_FREE (SOLVEstruct1); + } + + stat->utime[REFINE] = SuperLU_timer_ () - t; + } /* end IterRefine */ + } + } + +if (grid3d->zscp.Iam == 0) /* on 2D grid-0 */ + { + /* Permute the solution matrix B <= Pc'*X. */ + pdPermute_Dense_Matrix (fst_row, m_loc, SOLVEstruct->row_to_proc, + SOLVEstruct->inv_perm_c, + X, ldx, B, ldb, nrhs, grid); +#if ( DEBUGlevel>=2 ) + printf ("\n (%d) .. After pdPermute_Dense_Matrix(): b =\n", iam); + for (i = 0; i < m_loc; ++i) + printf ("\t(%d)\t%4d\t%.10f\n", iam, i + fst_row, B[i]); +#endif + /* Transform the solution matrix X to a solution of the original + system before the equilibration. */ + if (notran) + { + if (colequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= C[irow]; + ++irow; + } + b_col += ldb; + } + } + } + else if (rowequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= R[irow]; + ++irow; + } + b_col += ldb; + } + } + + // SUPERLU_FREE (b_work); + } + if (grid3d->zscp.Iam == 0 || Solve3D) + SUPERLU_FREE (X); + + } /* end if nrhs > 0 and factor successful */ + +#if ( PRNTlevel>=1 ) + if (!grid3d->iam) { + printf (".. DiagScale = %d\n", ScalePermstruct->DiagScale); + } +#endif + + + if ( grid3d->zscp.Iam == 0 ) { // only process layer 0 + /* Deallocate R and/or C if it was not used. 
*/ + if (Equil && Fact != SamePattern_SameRowPerm) + { + switch (ScalePermstruct->DiagScale) { + case NOEQUIL: + SUPERLU_FREE (R); + SUPERLU_FREE (C); + break; + case ROW: + SUPERLU_FREE (C); + break; + case COL: + SUPERLU_FREE (R); + break; + default: break; + } + } + +#if 0 + if (!factored && Fact != SamePattern_SameRowPerm && !parSymbFact) + Destroy_CompCol_Permuted_dist (&GAC); +#endif + + } /* process layer 0 done solve */ + + /* Scatter the solution from 2D grid-0 to 3D grid */ + if (nrhs > 0) + dScatter_B3d(A3d, grid3d); + + B = A3d->B3d; // B is now assigned back to B3d on return + A->Store = Astore3d; // restore Astore to 3D + +#if (DEBUGlevel >= 1) + CHECK_MALLOC(iam, "Exit pdgssvx3d()"); +#endif +} diff --git a/SRC/pdutil.c b/SRC/pdutil.c index d3094d31..09413734 100755 --- a/SRC/pdutil.c +++ b/SRC/pdutil.c @@ -1258,6 +1258,37 @@ void dSolveFinalize(superlu_dist_options_t *options, dSOLVEstruct_t *SOLVEstruct } } /* dSolveFinalize */ +#if 0 +void dDestroy_A3d_gathered_on_2d(dSOLVEstruct_t *SOLVEstruct, gridinfo3d_t *grid3d) +{ + /* free A2d and B2d, which are allocated only in 2D layer grid-0 */ + NRformat_loc3d *A3d = SOLVEstruct->A3d; + NRformat_loc *A2d = A3d->A_nfmt; + if (grid3d->zscp.Iam == 0) + { + SUPERLU_FREE(A2d->rowptr); + SUPERLU_FREE(A2d->colind); + SUPERLU_FREE(A2d->nzval); + } + SUPERLU_FREE(A3d->row_counts_int); // free displacements and counts + SUPERLU_FREE(A3d->row_disp); + SUPERLU_FREE(A3d->nnz_counts_int); + SUPERLU_FREE(A3d->nnz_disp); + SUPERLU_FREE(A3d->b_counts_int); + SUPERLU_FREE(A3d->b_disp); + int rankorder = grid3d->rankorder; + if (rankorder == 0) + { /* Z-major in 3D grid */ + SUPERLU_FREE(A3d->procs_to_send_list); + SUPERLU_FREE(A3d->send_count_list); + SUPERLU_FREE(A3d->procs_recv_from_list); + SUPERLU_FREE(A3d->recv_count_list); + } + SUPERLU_FREE(A2d); // free 2D structure + SUPERLU_FREE(A3d); // free 3D structure +} /* dDestroy_A3d_gathered_on_2d */ + +#else void dDestroy_A3d_gathered_on_2d(dSOLVEstruct_t 
*SOLVEstruct, gridinfo3d_t *grid3d) { /* free A2d and B2d, which are allocated on all 2D layers*/ @@ -1283,7 +1314,9 @@ void dDestroy_A3d_gathered_on_2d(dSOLVEstruct_t *SOLVEstruct, gridinfo3d_t *grid } SUPERLU_FREE( A2d ); // free 2D structure SUPERLU_FREE( A3d ); // free 3D structure -} /* dDestroy_A3d_gathered_on_2d */ +} /* dDestroy_A3d_gathered_on_2d_allgrid */ +#endif + /*! \brief Check the inf-norm of the error vector diff --git a/SRC/ssvx3dAux.c b/SRC/ssvx3dAux.c index b2dcb88f..c9142d91 100644 --- a/SRC/ssvx3dAux.c +++ b/SRC/ssvx3dAux.c @@ -243,7 +243,9 @@ void findRowPerm_MC64(gridinfo_t* grid, int_t job, double* R1, double* C1, int_t* iinfo) { + #if ( DEBUGlevel>=1 ) LOG_FUNC_ENTER(); + #endif // Check input parameters if (colptr == NULL || rowind == NULL || a_GA == NULL || perm_r == NULL ) { @@ -297,7 +299,9 @@ void scale_distributed_matrix(int_t rowequ, int_t colequ, int_t m, int_t n, int_t m_loc, int_t *rowptr, int_t *colind, int_t fst_row, double *a, double *R, double *C, double *R1, double *C1) { - printf("\033[1;32mEntering function scale_distributed_matrix at %s:%d\033[0m\n", __FILE__, __LINE__); + #if ( DEBUGlevel>=1 ) + LOG_FUNC_ENTER(); + #endif // Scale the row and column factors for (int i = 0; i < n; ++i) { R1[i] = exp(R1[i]); @@ -503,7 +507,9 @@ void perform_row_permutation( int_t *colequ, int_t *iinfo) { + #if ( DEBUGlevel>=1 ) LOG_FUNC_ENTER(); + #endif int_t *perm_r = ScalePermstruct->perm_r; /* Get NC format data from SuperMatrix GA */ NCformat* GAstore = (NCformat *)GA->Store; @@ -648,7 +654,9 @@ void permCol_SymbolicFact3d(superlu_dist_options_t *options, int_t n, SuperMatri superlu_dist_mem_usage_t*symb_mem_usage, gridinfo3d_t* grid3d) { + #if ( DEBUGlevel>=1 ) LOG_FUNC_ENTER(); + #endif SuperMatrix GAC; /* Global A in NCP format */ NCPformat *GACstore; int_t *GACcolbeg, *GACcolend, *GACrowind, irow;