Skip to content

Commit

Permalink
get the new 3D redistribution function to work; needs more complete t…
Browse files Browse the repository at this point in the history
…ests
  • Loading branch information
liuyangzhuan committed Oct 3, 2023
1 parent 67a3777 commit 16f9568
Show file tree
Hide file tree
Showing 8 changed files with 5,718 additions and 1,640 deletions.
135 changes: 130 additions & 5 deletions SRC/d3DPartition.c
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,13 @@ SupernodeToGridMap_t* createSuperGridMap(int_t nsuper,int_t maxLvl, int_t *myTre
}
void newTrfPartitionInit(int_t nsupers, dLUstruct_t *LUstruct, gridinfo3d_t *grid3d)
{

gridinfo_t* grid = &(grid3d->grid2d);
int iam = grid3d->iam;
#if ( DEBUGlevel>=1 )
CHECK_MALLOC (iam, "Enter newTrfPartitionInit()");
#endif

// check parameters
if (LUstruct->trf3Dpart == NULL || grid3d == NULL)
{
Expand All @@ -109,6 +116,13 @@ void newTrfPartitionInit(int_t nsupers, dLUstruct_t *LUstruct, gridinfo3d_t *gr
// Conversion of supernodal etree to list
treeList_t *treeList = setree2list(nsupers, setree);

// YL: The essential difference between this function and dinitTrf3Dpartition_allgrid is that Piyush has removed the treelist weight update below (and iperm_c_supno as well), which requires the LU data structure; removing it avoids calling pddistribute* twice.
#if 0
/*update treelist with weight and depth*/
getSCUweight_allgrid(nsupers, treeList, xsup,
LUstruct->Llu->Lrowind_bc_ptr, LUstruct->Llu->Ufstnz_br_ptr,
grid3d);
#endif
// Calculation of tree weight
calcTreeWeight(nsupers, setree, treeList, LUstruct->Glu_persist->xsup);

Expand All @@ -130,17 +144,107 @@ void newTrfPartitionInit(int_t nsupers, dLUstruct_t *LUstruct, gridinfo3d_t *gr
// sForests, LUstruct, grid3d);
int_t *myNodeCount = getMyNodeCountsFr(maxLvl, myTreeIdxs, sForests);
int_t **treePerm = getTreePermFr(myTreeIdxs, sForests, grid3d);
int* supernodeMask = SUPERLU_MALLOC(nsupers*sizeof(int));
for (int ii = 0; ii < nsupers; ++ii)
supernodeMask[ii]=0;
for (int lvl = 0; lvl < maxLvl; ++lvl)
{
// printf("iam %5d lvl %5d myNodeCount[lvl] %5d\n",grid3d->iam, lvl,myNodeCount[lvl]);
for (int nd = 0; nd < myNodeCount[lvl]; ++nd)
{
supernodeMask[treePerm[lvl][nd]]=1;
}
}





// dLUValSubBuf_t *LUvsb = SUPERLU_MALLOC(sizeof(dLUValSubBuf_t));
// dLluBufInit(LUvsb, LUstruct);


#if (DEBUGlevel>=1)
// compute the sum of gNodeCount
int_t gNodeCountSum = 0;
for (int_t i = 0; i < (1 << maxLvl) - 1; ++i)
{
gNodeCountSum += gNodeCount[i];
}
printf(" Iam: %d, Nsupers %d, gnodecountSum =%d \n", grid3d->iam, nsupers, gNodeCountSum);
#endif

/* Sherry 2/17/23
Compute buffer sizes needed for diagonal LU blocks and C matrices in GEMM. */


iam = grid->iam; /* 'grid' is 2D grid */
int k, k0, k_st, k_end, offset, nsupc, krow, kcol;
int myrow = MYROW (iam, grid);
int mycol = MYCOL (iam, grid);
int_t *xsup = LUstruct->Glu_persist->xsup;

#if 0
int krow = PROW (k, grid);
int kcol = PCOL (k, grid);
int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;

int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
double** Unzval_br_ptr = Llu->Unzval_br_ptr;
#endif

int mxLeafNode = 0; // Yang: only need to check the leaf level of topoInfo as the factorization proceeds level by level
for (int ilvl = 0; ilvl < maxLvl; ++ilvl) {
if (sForests[myTreeIdxs[ilvl]] && sForests[myTreeIdxs[ilvl]]->topoInfo.eTreeTopLims[1] > mxLeafNode )
mxLeafNode = sForests[myTreeIdxs[ilvl]]->topoInfo.eTreeTopLims[1];
}

// Yang: use ldts to track the maximum needed buffer sizes per node of topoInfo
//int *ldts = (int*) SUPERLU_MALLOC(mxLeafNode*sizeof(int));
//for (int i = 0; i < mxLeafNode; ++i) { //????????
//ldts[i]=1;
//}
int *ldts = int32Calloc_dist(mxLeafNode);

for (int ilvl = 0; ilvl < maxLvl; ++ilvl) { /* Loop through the Pz tree levels */
int treeId = myTreeIdxs[ilvl];
sForest_t* sforest = sForests[treeId];
if (sforest){
int_t *perm_node = sforest->nodeList ; /* permuted list, in order of factorization */
int maxTopoLevel = sforest->topoInfo.numLvl;/* number of levels at each outer-tree node */
for (int topoLvl = 0; topoLvl < maxTopoLevel; ++topoLvl)
{
/* code */
k_st = sforest->topoInfo.eTreeTopLims[topoLvl];
k_end = sforest->topoInfo.eTreeTopLims[topoLvl + 1];
//printf("\t..topoLvl %d, k_st %d, k_end %d\n", topoLvl, k_st, k_end);

for (int k0 = k_st; k0 < k_end; ++k0)
{
offset = k0 - k_st;
k = perm_node[k0];
nsupc = (xsup[k+1]-xsup[k]);
krow = PROW (k, grid);
kcol = PCOL (k, grid);
if ( myrow == krow || mycol == kcol ) /* diagonal process */
{
ldts[offset] = SUPERLU_MAX(ldts[offset], nsupc);
}
#if 0 /* GPU gemm buffers can only be set on GPU side, because here we only know
the size of U data structure on CPU. It is different on GPU */
if ( mycol == kcol ) { /* processes owning L panel */

}
if ( myrow == krow )
gemmCsizes[offset] = SUPERLU_MAX(ldts[offset], ???);
#endif
}
}
}
}




trf3Dpart->gEtreeInfo = fillEtreeInfo(nsupers, setree, treeList);
// trf3Dpart->iperm_c_supno = iperm_c_supno;
Expand All @@ -149,11 +253,22 @@ void newTrfPartitionInit(int_t nsupers, dLUstruct_t *LUstruct, gridinfo3d_t *gr
trf3Dpart->myZeroTrIdxs = myZeroTrIdxs;
trf3Dpart->sForests = sForests;
trf3Dpart->treePerm = treePerm;
trf3Dpart->maxLvl = maxLvl;
// trf3Dpart->LUvsb = LUvsb;
trf3Dpart->supernode2treeMap = createSupernode2TreeMap(nsupers, maxLvl, gNodeCount, gNodeLists);
trf3Dpart->superGridMap = createSuperGridMap(nsupers, maxLvl, myTreeIdxs, myZeroTrIdxs, gNodeCount, gNodeLists);


trf3Dpart->supernodeMask = supernodeMask;
trf3Dpart->mxLeafNode = mxLeafNode; // Sherry added these 3
trf3Dpart->diagDims = ldts;
//trf3Dpart->gemmCsizes = gemmCsizes;
// Sherry added
// Deallocate storage
SUPERLU_FREE(gNodeCount);
SUPERLU_FREE(gNodeLists);
free_treelist(nsupers, treeList);
#if ( DEBUGlevel>=1 )
CHECK_MALLOC (iam, "Exit newTrfPartitionInit()");
#endif

}

Expand Down Expand Up @@ -326,18 +441,28 @@ void bcastPermutedSparseA(SuperMatrix *A,
// beyond the last row, so that rowptr[n_loc] = nnz_loc.*/
// } NRformat_loc;


// NRformat_loc *Astore = (NRformat_loc *) A->Store;
MPI_Bcast(&(Astore->nnz_loc), sizeof(int_t), MPI_BYTE, 0, grid3d->zscp.comm);
MPI_Bcast(&(Astore->m_loc), sizeof(int_t), MPI_BYTE, 0, grid3d->zscp.comm);
MPI_Bcast(&(Astore->fst_row), sizeof(int_t), MPI_BYTE, 0, grid3d->zscp.comm);


/***** YL: the allocation below has been removed because dGatherNRformat_loc3d_allgrid, rather than dGatherNRformat_loc3d, has been called, which already allocates A->Store on all grids.
* Note that the broadcast is still needed, as A->Store has been scaled by scaleMatrixDiagonally only on grid 0.
*/
#if 1
MPI_Bcast(Astore->nzval, Astore->nnz_loc*sizeof(double), MPI_BYTE, 0, grid3d->zscp.comm);
MPI_Bcast(Astore->rowptr, (Astore->m_loc+1)*sizeof(int_t), MPI_BYTE, 0, grid3d->zscp.comm);
MPI_Bcast(Astore->colind, Astore->nnz_loc*sizeof(int_t), MPI_BYTE, 0, grid3d->zscp.comm);
#else
allocBcastArray( &(Astore->nzval), Astore->nnz_loc*sizeof(double),
0, grid3d->zscp.comm);
allocBcastArray( &(Astore->rowptr), (Astore->m_loc+1)*sizeof(int_t),
0, grid3d->zscp.comm);
allocBcastArray( &(Astore->colind), Astore->nnz_loc*sizeof(int_t),
0, grid3d->zscp.comm);

#endif

}

Expand Down
4 changes: 3 additions & 1 deletion SRC/pddistribute-aux3d.c
Original file line number Diff line number Diff line change
Expand Up @@ -504,6 +504,8 @@ int_t checkDist3DLUStruct( dLUstruct_t* LUstruct, gridinfo3d_t* grid3d)
}
}
}
printf("Check 3D LU structure passed\n");
#if ( DEBUGlevel>=1 )
CHECK_MALLOC (grid3d->iam, "Exit checkDist3DLUStruct()");
#endif
return 0;
}
Loading

0 comments on commit 16f9568

Please sign in to comment.