From 16f9568640b3a8615c16f17a8610cc62ed0ef936 Mon Sep 17 00:00:00 2001 From: Yang Liu Date: Mon, 2 Oct 2023 17:10:07 -0700 Subject: [PATCH] get the new 3D redistribution function to work; needs more complete tests --- SRC/d3DPartition.c | 135 ++- SRC/pddistribute-aux3d.c | 4 +- SRC/pddistribute3d.c | 2187 +++++++++++++++++++----------------- SRC/pdgssvx3d.c | 721 +++--------- SRC/pdgssvx3d_1pass_Yang.c | 2116 ++++++++++++++++++++++++++++++++++ SRC/pdgssvx3d_2pass_Yang.c | 2150 +++++++++++++++++++++++++++++++++++ SRC/pdutil.c | 35 +- SRC/ssvx3dAux.c | 10 +- 8 files changed, 5718 insertions(+), 1640 deletions(-) create mode 100755 SRC/pdgssvx3d_1pass_Yang.c create mode 100755 SRC/pdgssvx3d_2pass_Yang.c diff --git a/SRC/d3DPartition.c b/SRC/d3DPartition.c index 403b3ee5..d15f2c95 100644 --- a/SRC/d3DPartition.c +++ b/SRC/d3DPartition.c @@ -96,6 +96,13 @@ SupernodeToGridMap_t* createSuperGridMap(int_t nsuper,int_t maxLvl, int_t *myTre } void newTrfPartitionInit(int_t nsupers, dLUstruct_t *LUstruct, gridinfo3d_t *grid3d) { + + gridinfo_t* grid = &(grid3d->grid2d); + int iam = grid3d->iam; +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter newTrfPartitionInit()"); +#endif + // check parameters if (LUstruct->trf3Dpart == NULL || grid3d == NULL) { @@ -109,6 +116,13 @@ void newTrfPartitionInit(int_t nsupers, dLUstruct_t *LUstruct, gridinfo3d_t *gr // Conversion of supernodal etree to list treeList_t *treeList = setree2list(nsupers, setree); +// YL: The essential difference between this function and dinitTrf3Dpartition_allgrid to avoid calling pddistribute* twice is that Piyush has removed the treelist weight update function below (and iperm_c_supno as well), which requires the LU data structure +#if 0 + /*update treelist with weight and depth*/ + getSCUweight_allgrid(nsupers, treeList, xsup, + LUstruct->Llu->Lrowind_bc_ptr, LUstruct->Llu->Ufstnz_br_ptr, + grid3d); +#endif // Calculation of tree weight calcTreeWeight(nsupers, setree, treeList, LUstruct->Glu_persist->xsup); @@ -130,10 +144,26 @@ void newTrfPartitionInit(int_t nsupers, dLUstruct_t *LUstruct, gridinfo3d_t *gr // sForests, LUstruct, grid3d); int_t *myNodeCount = getMyNodeCountsFr(maxLvl, myTreeIdxs, sForests); int_t **treePerm = getTreePermFr(myTreeIdxs, sForests, grid3d); + int* supernodeMask = SUPERLU_MALLOC(nsupers*sizeof(int)); + for (int ii = 0; ii < nsupers; ++ii) + supernodeMask[ii]=0; + for (int lvl = 0; lvl < maxLvl; ++lvl) + { + // printf("iam %5d lvl %5d myNodeCount[lvl] %5d\n",grid3d->iam, lvl,myNodeCount[lvl]); + for (int nd = 0; nd < myNodeCount[lvl]; ++nd) + { + supernodeMask[treePerm[lvl][nd]]=1; + } + } + + + + // dLUValSubBuf_t *LUvsb = SUPERLU_MALLOC(sizeof(dLUValSubBuf_t)); // dLluBufInit(LUvsb, LUstruct); - + +#if (DEBUGlevel>=1) // let count sum of gnodecount int_t gNodeCountSum = 0; for (int_t i = 0; i < (1 << maxLvl) - 1; ++i) @@ -141,6 +171,80 @@ void newTrfPartitionInit(int_t nsupers, dLUstruct_t *LUstruct, gridinfo3d_t *gr gNodeCountSum += gNodeCount[i]; } printf(" Iam: %d, Nsupers %d, gnodecountSum =%d \n", grid3d->iam, nsupers, gNodeCountSum); +#endif + + /* Sherry 2/17/23 + Compute buffer sizes needed for diagonal LU blocks and C matrices in GEMM. 
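+     For each node slot of a topology level, ldts[] below (saved as
+     trf3Dpart->diagDims) records the largest supernode size among the nodes
+     mapped to that slot, taken over all levels of my Pz tree; mxLeafNode
+     bounds the number of such slots. Hypothetical example: leaf-level
+     supernode sizes {8, 5, 12} give mxLeafNode >= 3 and diagDims[0..2] >= {8, 5, 12}.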
*/ + + + iam = grid->iam; /* 'grid' is 2D grid */ + int k, k0, k_st, k_end, offset, nsupc, krow, kcol; + int myrow = MYROW (iam, grid); + int mycol = MYCOL (iam, grid); + int_t *xsup = LUstruct->Glu_persist->xsup; + +#if 0 + int krow = PROW (k, grid); + int kcol = PCOL (k, grid); + int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + + int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + double** Unzval_br_ptr = Llu->Unzval_br_ptr; +#endif + + int mxLeafNode = 0; // Yang: only need to check the leaf level of topoInfo as the factorization proceeds level by level + for (int ilvl = 0; ilvl < maxLvl; ++ilvl) { + if (sForests[myTreeIdxs[ilvl]] && sForests[myTreeIdxs[ilvl]]->topoInfo.eTreeTopLims[1] > mxLeafNode ) + mxLeafNode = sForests[myTreeIdxs[ilvl]]->topoInfo.eTreeTopLims[1]; + } + + // Yang: use ldts to track the maximum needed buffer sizes per node of topoInfo + //int *ldts = (int*) SUPERLU_MALLOC(mxLeafNode*sizeof(int)); + //for (int i = 0; i < mxLeafNode; ++i) { //???????? + //ldts[i]=1; + //} + int *ldts = int32Calloc_dist(mxLeafNode); + + for (int ilvl = 0; ilvl < maxLvl; ++ilvl) { /* Loop through the Pz tree levels */ + int treeId = myTreeIdxs[ilvl]; + sForest_t* sforest = sForests[treeId]; + if (sforest){ + int_t *perm_node = sforest->nodeList ; /* permuted list, in order of factorization */ + int maxTopoLevel = sforest->topoInfo.numLvl;/* number of levels at each outer-tree node */ + for (int topoLvl = 0; topoLvl < maxTopoLevel; ++topoLvl) + { + /* code */ + k_st = sforest->topoInfo.eTreeTopLims[topoLvl]; + k_end = sforest->topoInfo.eTreeTopLims[topoLvl + 1]; + //printf("\t..topoLvl %d, k_st %d, k_end %d\n", topoLvl, k_st, k_end); + + for (int k0 = k_st; k0 < k_end; ++k0) + { + offset = k0 - k_st; + k = perm_node[k0]; + nsupc = (xsup[k+1]-xsup[k]); + krow = PROW (k, grid); + kcol = PCOL (k, grid); + if ( myrow == krow || mycol == kcol ) /* diagonal process */ + { + ldts[offset] = SUPERLU_MAX(ldts[offset], nsupc); + } +#if 0 /* GPU gemm buffers can only be set on GPU side, because here we only know + the size of U data structure on CPU. 
It is different on GPU */ + if ( mycol == kcol ) { /* processes owning L panel */ + + } + if ( myrow == krow ) + gemmCsizes[offset] = SUPERLU_MAX(ldts[offset], ???); +#endif + } + } + } + } + + + trf3Dpart->gEtreeInfo = fillEtreeInfo(nsupers, setree, treeList); // trf3Dpart->iperm_c_supno = iperm_c_supno; @@ -149,11 +253,22 @@ void newTrfPartitionInit(int_t nsupers, dLUstruct_t *LUstruct, gridinfo3d_t *gr trf3Dpart->myZeroTrIdxs = myZeroTrIdxs; trf3Dpart->sForests = sForests; trf3Dpart->treePerm = treePerm; + trf3Dpart->maxLvl = maxLvl; // trf3Dpart->LUvsb = LUvsb; trf3Dpart->supernode2treeMap = createSupernode2TreeMap(nsupers, maxLvl, gNodeCount, gNodeLists); trf3Dpart->superGridMap = createSuperGridMap(nsupers, maxLvl, myTreeIdxs, myZeroTrIdxs, gNodeCount, gNodeLists); - - + trf3Dpart->supernodeMask = supernodeMask; + trf3Dpart->mxLeafNode = mxLeafNode; // Sherry added these 3 + trf3Dpart->diagDims = ldts; + //trf3Dpart->gemmCsizes = gemmCsizes; + // Sherry added + // Deallocate storage + SUPERLU_FREE(gNodeCount); + SUPERLU_FREE(gNodeLists); + free_treelist(nsupers, treeList); +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit newTrfPartitionInit()"); +#endif } @@ -326,18 +441,28 @@ void bcastPermutedSparseA(SuperMatrix *A, // beyond the last row, so that rowptr[n_loc] = nnz_loc.*/ // } NRformat_loc; - + // NRformat_loc *Astore = (NRformat_loc *) A->Store; MPI_Bcast(&(Astore->nnz_loc), sizeof(int_t), MPI_BYTE, 0, grid3d->zscp.comm); MPI_Bcast(&(Astore->m_loc), sizeof(int_t), MPI_BYTE, 0, grid3d->zscp.comm); MPI_Bcast(&(Astore->fst_row), sizeof(int_t), MPI_BYTE, 0, grid3d->zscp.comm); + + +/***** YL: remove the allocation in the following as dGatherNRformat_loc3d_allgrid instead of dGatherNRformat_loc3d has been called, which already allocate A->Store on all grids + * Note the the broadcast is still needed as the A->Store has been scaled by scaleMatrixDiagonally only on grid 0 +*/ +#if 1 + MPI_Bcast(Astore->nzval, Astore->nnz_loc*sizeof(double), MPI_BYTE, 0, grid3d->zscp.comm); + MPI_Bcast(Astore->rowptr, (Astore->m_loc+1)*sizeof(int_t), MPI_BYTE, 0, grid3d->zscp.comm); + MPI_Bcast(Astore->colind, Astore->nnz_loc*sizeof(int_t), MPI_BYTE, 0, grid3d->zscp.comm); +#else allocBcastArray( &(Astore->nzval), Astore->nnz_loc*sizeof(double), 0, grid3d->zscp.comm); allocBcastArray( &(Astore->rowptr), (Astore->m_loc+1)*sizeof(int_t), 0, grid3d->zscp.comm); allocBcastArray( &(Astore->colind), Astore->nnz_loc*sizeof(int_t), 0, grid3d->zscp.comm); - +#endif } diff --git a/SRC/pddistribute-aux3d.c b/SRC/pddistribute-aux3d.c index 9e868960..04ab6f0f 100644 --- a/SRC/pddistribute-aux3d.c +++ b/SRC/pddistribute-aux3d.c @@ -504,6 +504,8 @@ int_t checkDist3DLUStruct( dLUstruct_t* LUstruct, gridinfo3d_t* grid3d) } } } - printf("Check 3D LU structure passed\n"); +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (grid3d->iam, "Exit checkDist3DLUStruct()"); +#endif return 0; } \ No newline at end of file diff --git a/SRC/pddistribute3d.c b/SRC/pddistribute3d.c index e1aae876..968d7d8b 100644 --- a/SRC/pddistribute3d.c +++ b/SRC/pddistribute3d.c @@ -439,41 +439,19 @@ float pddistribute3d(superlu_dist_options_t *options, int_t n, SuperMatrix *A, int *index1; /* temporary pointer to array of int */ double *lusup, *lusup_srt, *uval; /* nonzero values in L and U */ double **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ - double *Lnzval_bc_dat; /* size: sum of sizes of Lnzval_bc_ptr[lk]) */ - long int *Lnzval_bc_offset; /* size ceil(NSUPERS/Pc) */ int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ - int_t *Lrowind_bc_dat; /* size: 
sum of sizes of Lrowind_bc_ptr[lk]) */ - long int *Lrowind_bc_offset; /* size ceil(NSUPERS/Pc) */ int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) */ - int_t *Lindval_loc_bc_dat; /* size: sum of sizes of Lindval_loc_bc_ptr[lk]) */ - long int *Lindval_loc_bc_offset; /* size ceil(NSUPERS/Pc) */ int_t *Unnz; /* size ceil(NSUPERS/Pc) */ double **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ - double *Unzval_br_dat; /* size: sum of sizes of Unzval_br_ptr[lk]) */ - long int *Unzval_br_offset; /* size ceil(NSUPERS/Pr) */ - long int Unzval_br_cnt = 0; int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ - int_t *Ufstnz_br_dat; /* size: sum of sizes of Ufstnz_br_ptr[lk]) */ - long int *Ufstnz_br_offset; /* size ceil(NSUPERS/Pr) */ - long int Ufstnz_br_cnt = 0; - - C_Tree *LBtree_ptr = NULL; /* size ceil(NSUPERS/Pc) */ - C_Tree *LRtree_ptr = NULL; /* size ceil(NSUPERS/Pr) */ - C_Tree *UBtree_ptr = NULL; /* size ceil(NSUPERS/Pc) */ - C_Tree *URtree_ptr = NULL; /* size ceil(NSUPERS/Pr) */ + int msgsize; int_t *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */ Ucb_indptr_t **Ucb_indptr; /* Vertical linked list pointing to Uindex[] */ - Ucb_indptr_t *Ucb_inddat; - long int *Ucb_indoffset; - long int Ucb_indcnt = 0; int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ - int_t *Ucb_valdat; - long int *Ucb_valoffset; - long int Ucb_valcnt = 0; /*-- Counts to be used in factorization. --*/ int *ToRecv, *ToSendD, **ToSendR; @@ -522,11 +500,7 @@ float pddistribute3d(superlu_dist_options_t *options, int_t n, SuperMatrix *A, int *frecv, *brecv; int_t *lloc; double **Linv_bc_ptr; /* size ceil(NSUPERS/Pc) */ - double *Linv_bc_dat; /* size: sum of sizes of Linv_bc_ptr[lk]) */ - long int *Linv_bc_offset; /* size ceil(NSUPERS/Pc) */ double **Uinv_bc_ptr; /* size ceil(NSUPERS/Pc) */ - double *Uinv_bc_dat; /* size: sum of sizes of Uinv_bc_ptr[lk]) */ - long int *Uinv_bc_offset; /* size ceil(NSUPERS/Pc) */ double *SeedSTD_BC, *SeedSTD_RD; int_t idx_indx, idx_lusup; int_t nbrow; @@ -623,20 +597,8 @@ float pddistribute3d(superlu_dist_options_t *options, int_t n, SuperMatrix *A, if (!(Unzval_br_ptr = (double **)SUPERLU_MALLOC(k * sizeof(double *)))) ABORT("Malloc fails for Unzval_br_ptr[]."); - if (!(Unzval_br_offset = - (long int *)SUPERLU_MALLOC(k * sizeof(long int)))) - { - fprintf(stderr, "Malloc fails for Unzval_br_offset[]."); - } - Unzval_br_offset[k - 1] = -1; if (!(Ufstnz_br_ptr = (int_t **)SUPERLU_MALLOC(k * sizeof(int_t *)))) ABORT("Malloc fails for Ufstnz_br_ptr[]."); - if (!(Ufstnz_br_offset = - (long int *)SUPERLU_MALLOC(k * sizeof(long int)))) - { - fprintf(stderr, "Malloc fails for Ufstnz_br_offset[]."); - } - Ufstnz_br_offset[k - 1] = -1; if (!(ToSendD = SUPERLU_MALLOC(k * sizeof(int)))) ABORT("Malloc fails for ToSendD[]."); @@ -765,12 +727,8 @@ float pddistribute3d(superlu_dist_options_t *options, int_t n, SuperMatrix *A, if (!(index = intMalloc_dist(len1 + 1))) ABORT("Malloc fails for Uindex[]."); Ufstnz_br_ptr[lb] = index; - Ufstnz_br_offset[lb] = len1 + 1; - Ufstnz_br_cnt += Ufstnz_br_offset[lb]; if (!(Unzval_br_ptr[lb] = doubleMalloc_dist(len))) ABORT("Malloc fails for Unzval_br_ptr[*][]."); - Unzval_br_offset[lb] = len; - Unzval_br_cnt += Unzval_br_offset[lb]; mybufmax[2] = SUPERLU_MAX(mybufmax[2], len1); mybufmax[3] = SUPERLU_MAX(mybufmax[3], len); @@ -783,8 +741,6 @@ float pddistribute3d(superlu_dist_options_t *options, int_t n, SuperMatrix *A, { Ufstnz_br_ptr[lb] = NULL; Unzval_br_ptr[lb] = NULL; - Unzval_br_offset[lb] = -1; - Ufstnz_br_offset[lb] = -1; } 
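#if ( DEBUGlevel>=2 )
	    /* Illustrative sanity check (not part of the original patch): in the
	       1D block-cyclic layout, local block row lb holds global block row
	       myrow + lb*nprow, and LBi() inverts that mapping. */
	    assert( LBi( myrow + lb * grid->nprow, grid ) == lb );
#endif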
Urb_length[lb] = 0; /* Reset block length. */ Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ @@ -835,54 +791,24 @@ float pddistribute3d(superlu_dist_options_t *options, int_t n, SuperMatrix *A, if (!(Lrowind_bc_ptr = (int_t **)SUPERLU_MALLOC(k * sizeof(int_t *)))) ABORT("Malloc fails for Lrowind_bc_ptr[]."); Lrowind_bc_ptr[k - 1] = NULL; - if (!(Lrowind_bc_offset = - (long int *)SUPERLU_MALLOC(k * sizeof(long int)))) - { - fprintf(stderr, "Malloc fails for Lrowind_bc_offset[]."); - } - Lrowind_bc_offset[k - 1] = -1; - if (!(Lnzval_bc_offset = - (long int *)SUPERLU_MALLOC(k * sizeof(long int)))) - { - fprintf(stderr, "Malloc fails for Lnzval_bc_offset[]."); - } - Lnzval_bc_offset[k - 1] = -1; if (!(Lindval_loc_bc_ptr = (int_t **)SUPERLU_MALLOC(k * sizeof(int_t *)))) ABORT("Malloc fails for Lindval_loc_bc_ptr[]."); Lindval_loc_bc_ptr[k - 1] = NULL; - if (!(Lindval_loc_bc_offset = - (long int *)SUPERLU_MALLOC(k * sizeof(long int)))) - { - fprintf(stderr, "Malloc fails for Lindval_loc_bc_offset[]."); - } - Lindval_loc_bc_offset[k - 1] = -1; if (!(Linv_bc_ptr = (double **)SUPERLU_MALLOC(k * sizeof(double *)))) { fprintf(stderr, "Malloc fails for Linv_bc_ptr[]."); } - if (!(Linv_bc_offset = - (long int *)SUPERLU_MALLOC(k * sizeof(long int)))) - { - fprintf(stderr, "Malloc fails for Linv_bc_offset[]."); - } if (!(Uinv_bc_ptr = (double **)SUPERLU_MALLOC(k * sizeof(double *)))) { fprintf(stderr, "Malloc fails for Uinv_bc_ptr[]."); } - if (!(Uinv_bc_offset = - (long int *)SUPERLU_MALLOC(k * sizeof(long int)))) - { - fprintf(stderr, "Malloc fails for Uinv_bc_offset[]."); - } Linv_bc_ptr[k - 1] = NULL; Uinv_bc_ptr[k - 1] = NULL; - Linv_bc_offset[k - 1] = -1; - Uinv_bc_offset[k - 1] = -1; if (!(Unnz = (int_t *)SUPERLU_MALLOC(k * sizeof(int_t)))) @@ -914,11 +840,6 @@ float pddistribute3d(superlu_dist_options_t *options, int_t n, SuperMatrix *A, PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS. THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U. ------------------------------------------------------------*/ - long int Linv_bc_cnt = 0; - long int Uinv_bc_cnt = 0; - long int Lrowind_bc_cnt = 0; - long int Lnzval_bc_cnt = 0; - long int Lindval_loc_bc_cnt = 0; for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... 
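          of L and U: scatter A(:,jb) into the SPA dense[], then fill this
          block column's U and L structures from it in one pass, as announced
          in the header comment above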
*/ @@ -1091,17 +1012,11 @@ float pddistribute3d(superlu_dist_options_t *options, int_t n, SuperMatrix *A, len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; if (!(index = intMalloc_dist(len1))) ABORT("Malloc fails for index[]"); - Lrowind_bc_offset[ljb] = len1; - Lrowind_bc_cnt += Lrowind_bc_offset[ljb]; if (!(lusup = (double *)SUPERLU_MALLOC(len * nsupc * sizeof(double)))) ABORT("Malloc fails for lusup[]"); - Lnzval_bc_offset[ljb] = len * nsupc; - Lnzval_bc_cnt += Lnzval_bc_offset[ljb]; if (!(Lindval_loc_bc_ptr[ljb] = intCalloc_dist(nrbl * 3))) ABORT("Malloc fails for Lindval_loc_bc_ptr[ljb][]"); - Lindval_loc_bc_offset[ljb] = nrbl * 3; - Lindval_loc_bc_cnt += Lindval_loc_bc_offset[ljb]; myrow = MYROW(iam, grid); krow = PROW(jb, grid); @@ -1109,20 +1024,14 @@ float pddistribute3d(superlu_dist_options_t *options, int_t n, SuperMatrix *A, { /* diagonal block */ if (!(Linv_bc_ptr[ljb] = (double *)SUPERLU_MALLOC(nsupc * nsupc * sizeof(double)))) ABORT("Malloc fails for Linv_bc_ptr[ljb][]"); - Linv_bc_offset[ljb] = nsupc * nsupc; - Linv_bc_cnt += Linv_bc_offset[ljb]; if (!(Uinv_bc_ptr[ljb] = (double *)SUPERLU_MALLOC(nsupc * nsupc * sizeof(double)))) ABORT("Malloc fails for Uinv_bc_ptr[ljb][]"); - Uinv_bc_offset[ljb] = nsupc * nsupc; - Uinv_bc_cnt += Uinv_bc_offset[ljb]; } else { Linv_bc_ptr[ljb] = NULL; - Linv_bc_offset[ljb] = -1; Uinv_bc_ptr[ljb] = NULL; - Uinv_bc_offset[ljb] = -1; } mybufmax[0] = SUPERLU_MAX(mybufmax[0], len1); @@ -1241,12 +1150,7 @@ float pddistribute3d(superlu_dist_options_t *options, int_t n, SuperMatrix *A, Lrowind_bc_ptr[ljb] = NULL; Lnzval_bc_ptr[ljb] = NULL; Linv_bc_ptr[ljb] = NULL; - Linv_bc_offset[ljb] = -1; - Lrowind_bc_offset[ljb] = -1; - Lindval_loc_bc_offset[ljb] = -1; - Lnzval_bc_offset[ljb] = -1; Uinv_bc_ptr[ljb] = NULL; - Uinv_bc_offset[ljb] = -1; Lindval_loc_bc_ptr[ljb] = NULL; } /* if nrbl ... */ @@ -1267,126 +1171,12 @@ float pddistribute3d(superlu_dist_options_t *options, int_t n, SuperMatrix *A, Lrowind_bc_ptr[ljb] = NULL; Lnzval_bc_ptr[ljb] = NULL; Linv_bc_ptr[ljb] = NULL; - Linv_bc_offset[ljb] = -1; - Lrowind_bc_offset[ljb] = -1; - Lindval_loc_bc_offset[ljb] = -1; - Lnzval_bc_offset[ljb] = -1; Uinv_bc_ptr[ljb] = NULL; - Uinv_bc_offset[ljb] = -1; Lindval_loc_bc_ptr[ljb] = NULL; } } } /* for jb ... */ - Linv_bc_cnt += 1; // safe guard - Uinv_bc_cnt += 1; - Lrowind_bc_cnt += 1; - Lindval_loc_bc_cnt += 1; - Lnzval_bc_cnt += 1; - if (!(Linv_bc_dat = - (double *)SUPERLU_MALLOC(Linv_bc_cnt * sizeof(double)))) - { - fprintf(stderr, "Malloc fails for Linv_bc_dat[]."); - } - if (!(Uinv_bc_dat = - (double *)SUPERLU_MALLOC(Uinv_bc_cnt * sizeof(double)))) - { - fprintf(stderr, "Malloc fails for Uinv_bc_dat[]."); - } - - if (!(Lrowind_bc_dat = - (int_t *)SUPERLU_MALLOC(Lrowind_bc_cnt * sizeof(int_t)))) - { - fprintf(stderr, "Malloc fails for Lrowind_bc_dat[]."); - } - if (!(Lindval_loc_bc_dat = - (int_t *)SUPERLU_MALLOC(Lindval_loc_bc_cnt * sizeof(int_t)))) - { - fprintf(stderr, "Malloc fails for Lindval_loc_bc_dat[]."); - } - if (!(Lnzval_bc_dat = - (double *)SUPERLU_MALLOC(Lnzval_bc_cnt * sizeof(double)))) - { - fprintf(stderr, "Malloc fails for Lnzval_bc_dat[]."); - } - - /* use contingous memory for Linv_bc_ptr, Uinv_bc_ptr, Lrowind_bc_ptr, Lnzval_bc_ptr*/ - k = CEILING(nsupers, grid->npcol); /* Number of local block columns */ - Linv_bc_cnt = 0; - Uinv_bc_cnt = 0; - Lrowind_bc_cnt = 0; - Lnzval_bc_cnt = 0; - Lindval_loc_bc_cnt = 0; - long int tmp_cnt; - for (jb = 0; jb < k; ++jb) - { /* for each block column ... 
*/ - if (Linv_bc_ptr[jb] != NULL) - { - for (jj = 0; jj < Linv_bc_offset[jb]; ++jj) - { - Linv_bc_dat[Linv_bc_cnt + jj] = Linv_bc_ptr[jb][jj]; - } - SUPERLU_FREE(Linv_bc_ptr[jb]); - Linv_bc_ptr[jb] = &Linv_bc_dat[Linv_bc_cnt]; - tmp_cnt = Linv_bc_offset[jb]; - Linv_bc_offset[jb] = Linv_bc_cnt; - Linv_bc_cnt += tmp_cnt; - } - - if (Uinv_bc_ptr[jb] != NULL) - { - for (jj = 0; jj < Uinv_bc_offset[jb]; ++jj) - { - Uinv_bc_dat[Uinv_bc_cnt + jj] = Uinv_bc_ptr[jb][jj]; - } - SUPERLU_FREE(Uinv_bc_ptr[jb]); - Uinv_bc_ptr[jb] = &Uinv_bc_dat[Uinv_bc_cnt]; - tmp_cnt = Uinv_bc_offset[jb]; - Uinv_bc_offset[jb] = Uinv_bc_cnt; - Uinv_bc_cnt += tmp_cnt; - } - - if (Lrowind_bc_ptr[jb] != NULL) - { - for (jj = 0; jj < Lrowind_bc_offset[jb]; ++jj) - { - Lrowind_bc_dat[Lrowind_bc_cnt + jj] = Lrowind_bc_ptr[jb][jj]; - } - SUPERLU_FREE(Lrowind_bc_ptr[jb]); - Lrowind_bc_ptr[jb] = &Lrowind_bc_dat[Lrowind_bc_cnt]; - tmp_cnt = Lrowind_bc_offset[jb]; - Lrowind_bc_offset[jb] = Lrowind_bc_cnt; - Lrowind_bc_cnt += tmp_cnt; - } - - if (Lnzval_bc_ptr[jb] != NULL) - { - for (jj = 0; jj < Lnzval_bc_offset[jb]; ++jj) - { - Lnzval_bc_dat[Lnzval_bc_cnt + jj] = Lnzval_bc_ptr[jb][jj]; - } - SUPERLU_FREE(Lnzval_bc_ptr[jb]); - Lnzval_bc_ptr[jb] = &Lnzval_bc_dat[Lnzval_bc_cnt]; - tmp_cnt = Lnzval_bc_offset[jb]; - Lnzval_bc_offset[jb] = Lnzval_bc_cnt; - Lnzval_bc_cnt += tmp_cnt; - } - - if (Lindval_loc_bc_ptr[jb] != NULL) - { - for (jj = 0; jj < Lindval_loc_bc_offset[jb]; ++jj) - { - Lindval_loc_bc_dat[Lindval_loc_bc_cnt + jj] = Lindval_loc_bc_ptr[jb][jj]; - } - SUPERLU_FREE(Lindval_loc_bc_ptr[jb]); - Lindval_loc_bc_ptr[jb] = &Lindval_loc_bc_dat[Lindval_loc_bc_cnt]; - tmp_cnt = Lindval_loc_bc_offset[jb]; - Lindval_loc_bc_offset[jb] = Lindval_loc_bc_cnt; - Lindval_loc_bc_cnt += tmp_cnt; - } - } /* for jb ... */ - - ///////////////////////////////////////////////////////////////// /* Set up additional pointers for the index and value arrays of U. nub is the number of local block columns. */ @@ -1399,18 +1189,6 @@ float pddistribute3d(superlu_dist_options_t *options, int_t n, SuperMatrix *A, ABORT("Malloc fails for Ucb_indptr[]"); if (!(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *)))) ABORT("Malloc fails for Ucb_valptr[]"); - if (!(Ucb_valoffset = - (long int *)SUPERLU_MALLOC(nub * sizeof(long int)))) - { - fprintf(stderr, "Malloc fails for Ucb_valoffset[]."); - } - Ucb_valoffset[nub - 1] = -1; - if (!(Ucb_indoffset = - (long int *)SUPERLU_MALLOC(nub * sizeof(long int)))) - { - fprintf(stderr, "Malloc fails for Ucb_indoffset[]."); - } - Ucb_indoffset[nub - 1] = -1; nlb = CEILING(nsupers, grid->nprow); /* Number of local block rows. */ @@ -1440,19 +1218,13 @@ float pddistribute3d(superlu_dist_options_t *options, int_t n, SuperMatrix *A, { /* Not an empty block column. 
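             Allocate the vertical lists for this block column: one Ucb_indptr
             entry (pointing into Uindex[]) and one Ucb_valptr entry (pointing
             into Unzval[]) for each of the Urbs[lb] row blocks.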
*/ if (!(Ucb_indptr[lb] = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t)))) ABORT("Malloc fails for Ucb_indptr[lb][]"); - Ucb_indoffset[lb] = Urbs[lb]; - Ucb_indcnt += Ucb_indoffset[lb]; if (!(Ucb_valptr[lb] = (int_t *)intMalloc_dist(Urbs[lb]))) ABORT("Malloc fails for Ucb_valptr[lb][]"); - Ucb_valoffset[lb] = Urbs[lb]; - Ucb_valcnt += Ucb_valoffset[lb]; } else { Ucb_valptr[lb] = NULL; - Ucb_valoffset[lb] = -1; Ucb_indptr[lb] = NULL; - Ucb_indoffset[lb] = -1; } } for (lk = 0; lk < nlb; ++lk) @@ -1503,905 +1275,1196 @@ float pddistribute3d(superlu_dist_options_t *options, int_t n, SuperMatrix *A, } } - Unzval_br_cnt += 1; // safe guard - Ufstnz_br_cnt += 1; - Ucb_valcnt += 1; - Ucb_indcnt += 1; - if (!(Unzval_br_dat = - (double *)SUPERLU_MALLOC(Unzval_br_cnt * sizeof(double)))) - { - fprintf(stderr, "Malloc fails for Lnzval_bc_dat[]."); - } - if (!(Ufstnz_br_dat = - (int_t *)SUPERLU_MALLOC(Ufstnz_br_cnt * sizeof(int_t)))) - { - fprintf(stderr, "Malloc fails for Ufstnz_br_dat[]."); - } - if (!(Ucb_valdat = - (int_t *)SUPERLU_MALLOC(Ucb_valcnt * sizeof(int_t)))) - { - fprintf(stderr, "Malloc fails for Ucb_valdat[]."); - } - if (!(Ucb_inddat = - (Ucb_indptr_t *)SUPERLU_MALLOC(Ucb_indcnt * sizeof(Ucb_indptr_t)))) - { - fprintf(stderr, "Malloc fails for Ucb_inddat[]."); - } - - /* use contingous memory for Unzval_br_ptr, Ufstnz_br_ptr, Ucb_valptr */ - k = CEILING(nsupers, grid->nprow); /* Number of local block rows */ - Unzval_br_cnt = 0; - Ufstnz_br_cnt = 0; - for (lb = 0; lb < k; ++lb) - { /* for each block row ... */ - if (Unzval_br_ptr[lb] != NULL) - { - for (jj = 0; jj < Unzval_br_offset[lb]; ++jj) - { - Unzval_br_dat[Unzval_br_cnt + jj] = Unzval_br_ptr[lb][jj]; - } - SUPERLU_FREE(Unzval_br_ptr[lb]); - Unzval_br_ptr[lb] = &Unzval_br_dat[Unzval_br_cnt]; - tmp_cnt = Unzval_br_offset[lb]; - Unzval_br_offset[lb] = Unzval_br_cnt; - Unzval_br_cnt += tmp_cnt; - } - - if (Ufstnz_br_ptr[lb] != NULL) - { - for (jj = 0; jj < Ufstnz_br_offset[lb]; ++jj) - { - Ufstnz_br_dat[Ufstnz_br_cnt + jj] = Ufstnz_br_ptr[lb][jj]; - } - SUPERLU_FREE(Ufstnz_br_ptr[lb]); - Ufstnz_br_ptr[lb] = &Ufstnz_br_dat[Ufstnz_br_cnt]; - tmp_cnt = Ufstnz_br_offset[lb]; - Ufstnz_br_offset[lb] = Ufstnz_br_cnt; - Ufstnz_br_cnt += tmp_cnt; - } - } - - k = CEILING(nsupers, grid->npcol); /* Number of local block columns */ - Ucb_valcnt = 0; - Ucb_indcnt = 0; - for (lb = 0; lb < k; ++lb) - { /* for each block row ... */ - if (Ucb_valptr[lb] != NULL) - { - for (jj = 0; jj < Ucb_valoffset[lb]; ++jj) - { - Ucb_valdat[Ucb_valcnt + jj] = Ucb_valptr[lb][jj]; - } - SUPERLU_FREE(Ucb_valptr[lb]); - Ucb_valptr[lb] = &Ucb_valdat[Ucb_valcnt]; - tmp_cnt = Ucb_valoffset[lb]; - Ucb_valoffset[lb] = Ucb_valcnt; - Ucb_valcnt += tmp_cnt; - } - if (Ucb_indptr[lb] != NULL) - { - for (jj = 0; jj < Ucb_indoffset[lb]; ++jj) - { - Ucb_inddat[Ucb_indcnt + jj] = Ucb_indptr[lb][jj]; - } - SUPERLU_FREE(Ucb_indptr[lb]); - Ucb_indptr[lb] = &Ucb_inddat[Ucb_indcnt]; - tmp_cnt = Ucb_indoffset[lb]; - Ucb_indoffset[lb] = Ucb_indcnt; - Ucb_indcnt += tmp_cnt; - } - } /* for lb ... */ - - ///////////////////////////////////////////////////////////////// - #if (PROFlevel >= 1) t = SuperLU_timer_(); #endif - if (!grid3d->zscp.Iam) - { /* construct the Bcast tree for L ... 
*/ - - k = CEILING(nsupers, grid->npcol); /* Number of local block columns */ - if (!(LBtree_ptr = (C_Tree *)SUPERLU_MALLOC(k * sizeof(C_Tree)))) - ABORT("Malloc fails for LBtree_ptr[]."); - if (!(ActiveFlag = intCalloc_dist(grid->nprow * 2))) - ABORT("Calloc fails for ActiveFlag[]."); - if (!(ranks = (int *)SUPERLU_MALLOC(grid->nprow * sizeof(int)))) - ABORT("Malloc fails for ranks[]."); - if (!(SeedSTD_BC = (double *)SUPERLU_MALLOC(k * sizeof(double)))) - ABORT("Malloc fails for SeedSTD_BC[]."); - - for (i = 0; i < k; i++) - { - SeedSTD_BC[i] = rand(); - } - - MPI_Allreduce(MPI_IN_PLACE, &SeedSTD_BC[0], k, MPI_DOUBLE, MPI_MAX, grid->cscp.comm); - - for (ljb = 0; ljb < k; ++ljb) - { - C_BcTree_Nullify(&LBtree_ptr[ljb]); - } - - if (!(ActiveFlagAll = intMalloc_dist(grid->nprow * k))) - ABORT("Calloc fails for ActiveFlag[]."); - memTRS += k * sizeof(C_Tree) + k * dword + grid->nprow * k * iword; // acount for LBtree_ptr, SeedSTD_BC, ActiveFlagAll - for (j = 0; j < grid->nprow * k; ++j) - ActiveFlagAll[j] = 3 * nsupers; - for (ljb = 0; ljb < k; ++ljb) - { /* for each local block column ... */ - jb = mycol + ljb * grid->npcol; /* not sure */ -#if 1 - // if (superGridMap[jb] != NOT_IN_GRID || !grid3d->zscp.Iam ) -#endif - { - if (jb < nsupers) - { - pc = PCOL(jb, grid); - fsupc = FstBlockC(jb); - nsupc = SuperSize(jb); - istart = xlsub[fsupc]; - for (i = istart; i < xlsub[fsupc + 1]; ++i) - { - irow = lsub[i]; - gb = BlockNum(irow); - pr = PROW(gb, grid); - ActiveFlagAll[pr + ljb * grid->nprow] = SUPERLU_MIN(ActiveFlagAll[pr + ljb * grid->nprow], gb); - } /* for j ... */ - } - } - } - for (ljb = 0; ljb < k; ++ljb) - { /* for each local block column ... */ + Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; + Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr; + Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; + Llu->Ufstnz_br_ptr = Ufstnz_br_ptr; + Llu->Unzval_br_ptr = Unzval_br_ptr; + Llu->Unnz = Unnz; + Llu->ToRecv = ToRecv; + Llu->ToSendD = ToSendD; + Llu->ToSendR = ToSendR; + Llu->fmod = fmod; + Llu->fsendx_plist = fsendx_plist; + Llu->nfrecvx = nfrecvx; + Llu->nfsendx = nfsendx; + Llu->bmod = bmod; + Llu->bsendx_plist = bsendx_plist; + Llu->nbrecvx = nbrecvx; + Llu->nbsendx = nbsendx; + Llu->ilsum = ilsum; + Llu->ldalsum = ldaspa; + Llu->Linv_bc_ptr = Linv_bc_ptr; + Llu->Uinv_bc_ptr = Uinv_bc_ptr; + Llu->Urbs = Urbs; + Llu->Ucb_indptr = Ucb_indptr; + Llu->Ucb_valptr = Ucb_valptr; - jb = mycol + ljb * grid->npcol; /* not sure */ -#if 1 - // if (superGridMap[jb] != NOT_IN_GRID || !grid3d->zscp.Iam ) +#if (PRNTlevel >= 1) + if (!iam) + printf(".. 
# L blocks " IFMT "\t# U blocks " IFMT "\n", + nLblocks, nUblocks); #endif - { - if (jb < nsupers) - { - pc = PCOL(jb, grid); - - for (j = 0; j < grid->nprow; ++j) - ActiveFlag[j] = ActiveFlagAll[j + ljb * grid->nprow]; - for (j = 0; j < grid->nprow; ++j) - ActiveFlag[j + grid->nprow] = j; - for (j = 0; j < grid->nprow; ++j) - ranks[j] = -1; - - Root = -1; - Iactive = 0; - for (j = 0; j < grid->nprow; ++j) - { - if (ActiveFlag[j] != 3 * nsupers) - { - gb = ActiveFlag[j]; - pr = PROW(gb, grid); - if (gb == jb) - Root = pr; - if (myrow == pr) - Iactive = 1; - } - } - - quickSortM(ActiveFlag, 0, grid->nprow - 1, grid->nprow, 0, 2); - - if (Iactive == 1) - { - - assert(Root > -1); - rank_cnt = 1; - ranks[0] = Root; - for (j = 0; j < grid->nprow; ++j) - { - if (ActiveFlag[j] != 3 * nsupers && ActiveFlag[j + grid->nprow] != Root) - { - ranks[rank_cnt] = ActiveFlag[j + grid->nprow]; - ++rank_cnt; - } - } - - if (rank_cnt > 1) - { - - for (ii = 0; ii < rank_cnt; ii++) // use global ranks rather than local ranks - ranks[ii] = PNUM(ranks[ii], pc, grid); - msgsize = SuperSize(jb); - - C_BcTree_Create(&LBtree_ptr[ljb], grid->comm, ranks, rank_cnt, msgsize, 'd'); - LBtree_ptr[ljb].tag_ = BC_L; - - if (Root == myrow) - { - rank_cnt_ref = 1; - for (j = 0; j < grid->nprow; ++j) - { - if (fsendx_plist[ljb][j] != SLU_EMPTY) - { - ++rank_cnt_ref; - } - } - assert(rank_cnt == rank_cnt_ref); - } - // #endif - } - } - } - } - } + SUPERLU_FREE(rb_marker); + SUPERLU_FREE(Urb_fstnz); + SUPERLU_FREE(Urb_length); + SUPERLU_FREE(Urb_indptr); + SUPERLU_FREE(Lrb_length); + SUPERLU_FREE(Lrb_number); + SUPERLU_FREE(Lrb_indptr); + SUPERLU_FREE(Lrb_valptr); + SUPERLU_FREE(dense); - SUPERLU_FREE(ActiveFlag); - SUPERLU_FREE(ActiveFlagAll); - SUPERLU_FREE(ranks); - SUPERLU_FREE(SeedSTD_BC); - memTRS -= k * dword + grid->nprow * k * iword; // acount for SeedSTD_BC, ActiveFlagAll + /* Find the maximum buffer size. */ + MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, + MPI_MAX, grid->comm); -#if (PROFlevel >= 1) - t = SuperLU_timer_() - t; - if (!iam) - printf(".. Construct Bcast tree for L: %.2f\t\n", t); -#endif + k = CEILING(nsupers, grid->nprow); /* Number of local block rows */ + if (!(Llu->mod_bit = int32Malloc_dist(k))) + ABORT("Malloc fails for mod_bit[]."); #if (PROFlevel >= 1) - t = SuperLU_timer_(); -#endif - /* construct the Reduce tree for L ... */ - /* the following is used as reference */ - nlb = CEILING(nsupers, grid->nprow); /* Number of local block rows */ - if (!(mod_bit = int32Malloc_dist(nlb))) - ABORT("Malloc fails for mod_bit[]."); - if (!(frecv = int32Malloc_dist(nlb))) - ABORT("Malloc fails for frecv[]."); - - for (k = 0; k < nlb; ++k) - mod_bit[k] = 0; - for (k = 0; k < nsupers; ++k) - { - pr = PROW(k, grid); - if (myrow == pr) - { - lib = LBi(k, grid); /* local block number */ - kcol = PCOL(k, grid); - if (mycol == kcol || fmod[lib]) - mod_bit[lib] = 1; /* contribution from off-diagonal and diagonal*/ - } - } - /* Every process receives the count, but it is only useful on the - diagonal processes. */ -#if 0 // Sherry: 1/26/2022 - MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); -#else - MPI_Allreduce(mod_bit, frecv, nlb, MPI_INT, MPI_SUM, grid->rscp.comm); + if (!iam) + printf(".. 
1st distribute time:\n " + "\tL\t%.2f\n\tU\t%.2f\n" + "\tu_blks %d\tnrbu %d\n--------\n", + t_l, t_u, u_blks, nrbu); #endif - k = CEILING(nsupers, grid->nprow); /* Number of local block rows */ - if (!(LRtree_ptr = (C_Tree *)SUPERLU_MALLOC(k * sizeof(C_Tree)))) - ABORT("Malloc fails for LRtree_ptr[]."); - if (!(ActiveFlag = intCalloc_dist(grid->npcol * 2))) - ABORT("Calloc fails for ActiveFlag[]."); - if (!(ranks = (int *)SUPERLU_MALLOC(grid->npcol * sizeof(int)))) - ABORT("Malloc fails for ranks[]."); - - if (!(SeedSTD_RD = (double *)SUPERLU_MALLOC(k * sizeof(double)))) - ABORT("Malloc fails for SeedSTD_RD[]."); - - for (i = 0; i < k; i++) - { - SeedSTD_RD[i] = rand(); - } - - MPI_Allreduce(MPI_IN_PLACE, &SeedSTD_RD[0], k, MPI_DOUBLE, MPI_MAX, grid->rscp.comm); - - for (lib = 0; lib < k; ++lib) - { - C_RdTree_Nullify(&LRtree_ptr[lib]); - } + } /* else fact != SamePattern_SameRowPerm */ - if (!(ActiveFlagAll = intMalloc_dist(grid->npcol * k))) - ABORT("Calloc fails for ActiveFlagAll[]."); - for (j = 0; j < grid->npcol * k; ++j) - ActiveFlagAll[j] = -3 * nsupers; - memTRS += k * sizeof(C_Tree) + k * dword + grid->npcol * k * iword; // acount for LRtree_ptr, SeedSTD_RD, ActiveFlagAll - for (jb = 0; jb < nsupers; ++jb) - { /* for each block column ... */ - fsupc = FstBlockC(jb); - pc = PCOL(jb, grid); -#if 1 - // if (superGridMap[jb] != NOT_IN_GRID || !grid3d->zscp.Iam ) -#endif - { - for (i = xlsub[fsupc]; i < xlsub[fsupc + 1]; ++i) - { - irow = lsub[i]; - ib = BlockNum(irow); - pr = PROW(ib, grid); - if (myrow == pr) - { /* Block row ib in my process row */ - lib = LBi(ib, grid); /* Local block number */ - ActiveFlagAll[pc + lib * grid->npcol] = SUPERLU_MAX(ActiveFlagAll[pc + lib * grid->npcol], jb); - } - } - } - } + if (xa[A->ncol] > 0) + { /* may not have any entries on this process. 
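         xa[A->ncol] is the local nonzero count returned by dReDistribute_A,
         so asub[] and a[] are freed only when this process actually received
         entries.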
*/
        SUPERLU_FREE(asub);
        SUPERLU_FREE(a);
    }
    SUPERLU_FREE(xa);

#if (DEBUGlevel >= 1)
    /* Memory allocated but not freed:
       ilsum, fmod, fsendx_plist, bmod, bsendx_plist */
    CHECK_MALLOC(iam, "Exit pddistribute3d()");
#endif

    return (mem_use + memTRS);

} /* PDDISTRIBUTE3D */

float
pddistribute3d_Yang(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
                    dScalePermstruct_t *ScalePermstruct,
                    Glu_freeable_t *Glu_freeable, dLUstruct_t *LUstruct,
                    gridinfo3d_t *grid3d)
/*
 * -- Distributed SuperLU routine (version 2.0) --
 * Lawrence Berkeley National Lab, Univ. of California Berkeley.
 * March 15, 2003
 *
 *
 * Purpose
 * =======
 *   Distribute the matrix onto the 2D process mesh on all grids, based on
 *   the superGridMap created by Piyush.
 *
 * Arguments
 * =========
 *
 * options (input) superlu_dist_options_t*
 *        options->Fact specifies whether or not the L and U structures will be re-used.
 *        = SamePattern_SameRowPerm: L and U structures are input, and
 *          unchanged on exit.
 *        = DOFACT or SamePattern: L and U structures are computed and output.
 *
 * n      (input) int
 *        Dimension of the matrix.
 *
 * A      (input) SuperMatrix*
 *        The distributed input matrix A of dimension (A->nrow, A->ncol).
 *        A may be overwritten by diag(R)*A*diag(C)*Pc^T. The type of A can be:
 *        Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE.
 *
 * ScalePermstruct (input) dScalePermstruct_t*
 *        The data structure to store the scaling and permutation vectors
 *        describing the transformations performed to the original matrix A.
 *
 * Glu_freeable (input) Glu_freeable_t*
 *        The global structure describing the graph of L and U.
 *
 * LUstruct (input) dLUstruct_t*
 *        Data structures for L and U factors.
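 *        These structures are filled on every 2D grid of the 3D mesh; blocks
 *        of supernodes that a grid does not own (superGridMap[] == NOT_IN_GRID)
 *        are left NULL, as inferred from the masked allocations below.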
+ * + * grid (input) gridinfo_t* + * The 2D process mesh. + * + * Return value + * ============ + * > 0, working storage required (in bytes). + * + */ +{ + gridinfo_t *grid = &(grid3d->grid2d); + dtrf3Dpartition_t *trf3Dpart = LUstruct->trf3Dpart; /* Data structure containing 3D partition info */ + SupernodeToGridMap_t *superGridMap = trf3Dpart->superGridMap; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + dLocalLU_t *Llu = LUstruct->Llu; + int_t bnnz, fsupc, fsupc1, i, ii, irow, istart, j, ib, jb, jj, k, k1, + len, len1, nsupc, masked; + int_t lib; /* local block row number */ + int_t nlb; /* local block rows*/ + int_t ljb; /* local block column number */ + int_t nrbl; /* number of L blocks in current block column */ + int_t nrbu; /* number of U blocks in current block column */ + int_t gb; /* global block number; 0 < gb <= nsuper */ + int_t lb; /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */ + int_t ub,gik,iklrow,fnz; + int iam, jbrow, kcol, krow, mycol, myrow, pc, pr; + int_t mybufmax[NBUFFERS]; + NRformat_loc *Astore; + double *a; + int_t *asub, *xa; + int_t *xa_begin, *xa_end; + int_t *xsup = Glu_persist->xsup; /* supernode and column mapping */ + int_t *supno = Glu_persist->supno; + int_t *lsub, *xlsub, *usub, *usub1, *xusub; + int_t nsupers; + int_t next_lind; /* next available position in index[*] */ + int_t next_lval; /* next available position in nzval[*] */ + int_t *index; /* indices consist of headers and row subscripts */ + int_t *index_srt; /* indices consist of headers and row subscripts */ + int *index1; /* temporary pointer to array of int */ + double *lusup, *lusup_srt, *uval; /* nonzero values in L and U */ - /* construct the Bcast tree for U ... */ + double **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ + double *Lnzval_bc_dat; /* size sum of sizes of Lnzval_bc_ptr[lk]) */ + long int *Lnzval_bc_offset; /* size ceil(NSUPERS/Pc) */ + + int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ + int_t *Lrowind_bc_dat; /* size sum of sizes of Lrowind_bc_ptr[lk]) */ + long int *Lrowind_bc_offset; /* size ceil(NSUPERS/Pc) */ + + int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) */ + int_t *Lindval_loc_bc_dat; /* size sum of sizes of Lindval_loc_bc_ptr[lk]) */ + long int *Lindval_loc_bc_offset; /* size ceil(NSUPERS/Pc) */ + + int_t *Unnz; /* size ceil(NSUPERS/Pc) */ + double **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ + double *Unzval_br_dat; /* size sum of sizes of Unzval_br_ptr[lk]) */ + long int *Unzval_br_offset; /* size ceil(NSUPERS/Pr) */ + long int Unzval_br_cnt=0; + int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ + int_t *Ufstnz_br_dat; /* size sum of sizes of Ufstnz_br_ptr[lk]) */ + long int *Ufstnz_br_offset; /* size ceil(NSUPERS/Pr) */ + long int Ufstnz_br_cnt=0; + + C_Tree *LBtree_ptr; /* size ceil(NSUPERS/Pc) */ + C_Tree *LRtree_ptr; /* size ceil(NSUPERS/Pr) */ + C_Tree *UBtree_ptr; /* size ceil(NSUPERS/Pc) */ + C_Tree *URtree_ptr; /* size ceil(NSUPERS/Pr) */ + int msgsize; + + int_t *Urbs,*Urbs1; /* Number of row blocks in each block column of U. 
*/ + Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ + Ucb_indptr_t *Ucb_inddat; + long int *Ucb_indoffset; + long int Ucb_indcnt=0; - k = CEILING(nsupers, grid->npcol); /* Number of local block columns */ - if (!(UBtree_ptr = (C_Tree *)SUPERLU_MALLOC(k * sizeof(C_Tree)))) - ABORT("Malloc fails for UBtree_ptr[]."); - if (!(ActiveFlag = intCalloc_dist(grid->nprow * 2))) - ABORT("Calloc fails for ActiveFlag[]."); - if (!(ranks = (int *)SUPERLU_MALLOC(grid->nprow * sizeof(int)))) - ABORT("Malloc fails for ranks[]."); - if (!(SeedSTD_BC = (double *)SUPERLU_MALLOC(k * sizeof(double)))) - ABORT("Malloc fails for SeedSTD_BC[]."); + int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ + int_t *Ucb_valdat; + long int *Ucb_valoffset; + long int Ucb_valcnt=0; - for (i = 0; i < k; i++) - { - SeedSTD_BC[i] = rand(); - } + /*-- Counts to be used in factorization. --*/ + int *ToRecv, *ToSendD, **ToSendR; - MPI_Allreduce(MPI_IN_PLACE, &SeedSTD_BC[0], k, MPI_DOUBLE, MPI_MAX, grid->cscp.comm); + /*-- Counts to be used in lower triangular solve. --*/ + int *fmod; /* Modification count for L-solve. */ + int **fsendx_plist; /* Column process list to send down Xk. */ + int nfrecvx = 0; /* Number of Xk I will receive. */ + int nfsendx = 0; /* Number of Xk I will send */ + int kseen; - for (ljb = 0; ljb < k; ++ljb) - { - C_BcTree_Nullify(&UBtree_ptr[ljb]); - } + /*-- Counts to be used in upper triangular solve. --*/ + int *bmod; /* Modification count for U-solve. */ + int **bsendx_plist; /* Column process list to send down Xk. */ + int nbrecvx = 0; /* Number of Xk I will receive. */ + int nbsendx = 0; /* Number of Xk I will send */ - if (!(ActiveFlagAll = intMalloc_dist(grid->nprow * k))) - ABORT("Calloc fails for ActiveFlagAll[]."); - for (j = 0; j < grid->nprow * k; ++j) - ActiveFlagAll[j] = -3 * nsupers; - memTRS += k * sizeof(C_Tree) + k * dword + grid->nprow * k * iword; // acount for UBtree_ptr, SeedSTD_BC, ActiveFlagAll + int_t *ilsum; /* starting position of each supernode in + the full array (local) */ - for (ljb = 0; ljb < k; ++ljb) - { /* for each local block column ... 
*/ - jb = mycol + ljb * grid->npcol; /* not sure */ -#if 1 - // if (superGridMap[jb] != NOT_IN_GRID || !grid3d->zscp.Iam ) -#endif - { - if (jb < nsupers) - { - pc = PCOL(jb, grid); + /*-- Auxiliary arrays; freed on return --*/ + int_t *rb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ + int_t *Urb_length; /* U block length; size ceil(NSUPERS/Pr) */ + int_t *Urb_indptr; /* pointers to U index[]; size ceil(NSUPERS/Pr) */ + int_t *Urb_fstnz; /* # of fstnz in a block row; size ceil(NSUPERS/Pr) */ + int_t *Ucbs; /* number of column blocks in a block row */ + int_t *Lrb_length; /* L block length; size ceil(NSUPERS/Pr) */ + int_t *Lrb_number; /* global block number; size ceil(NSUPERS/Pr) */ + int_t *Lrb_indptr; /* pointers to L index[]; size ceil(NSUPERS/Pr) */ + int_t *Lrb_valptr; /* pointers to L nzval[]; size ceil(NSUPERS/Pr) */ + int_t *ActiveFlag; + int_t *ActiveFlagAll; + int_t Iactive; + int *ranks; + int_t *idxs; + int_t **nzrows; + double rseed; + int rank_cnt,rank_cnt_ref,Root; + double *dense, *dense_col; /* SPA */ + double zero = 0.0; + int_t ldaspa; /* LDA of SPA */ + int_t iword, dword; + float mem_use = 0.0; + float memTRS = 0.; /* memory allocated for storing the meta-data for triangular solve (positive number)*/ - fsupc = FstBlockC(jb); - for (j = fsupc; j < FstBlockC(jb + 1); ++j) - { - istart = xusub[j]; - /* NOTE: Only the first nonzero index of the segment - is stored in usub[]. */ - for (i = istart; i < xusub[j + 1]; ++i) - { - irow = usub[i]; /* First nonzero in the segment. */ - gb = BlockNum(irow); - pr = PROW(gb, grid); - ActiveFlagAll[pr + ljb * grid->nprow] = SUPERLU_MAX(ActiveFlagAll[pr + ljb * grid->nprow], gb); - } - } - pr = PROW(jb, grid); // take care of diagonal node stored as L + int *mod_bit; + int *frecv, *brecv; + int_t *lloc; + double **Linv_bc_ptr; /* size ceil(NSUPERS/Pc) */ + double *Linv_bc_dat; /* size sum of sizes of Linv_bc_ptr[lk]) */ + long int *Linv_bc_offset; /* size ceil(NSUPERS/Pc) */ + double **Uinv_bc_ptr; /* size ceil(NSUPERS/Pc) */ + double *Uinv_bc_dat; /* size sum of sizes of Uinv_bc_ptr[lk]) */ + long int *Uinv_bc_offset; /* size ceil(NSUPERS/Pc) */ + double *SeedSTD_BC,*SeedSTD_RD; + int_t idx_indx,idx_lusup; + int_t nbrow; + int_t ik, il, lk, rel, knsupc, idx_r; + int_t lptr1_tmp, idx_i, idx_v,m, uu; + int_t nub; + int tag; - ActiveFlagAll[pr + ljb * grid->nprow] = SUPERLU_MAX(ActiveFlagAll[pr + ljb * grid->nprow], jb); - } - } - } - for (ljb = 0; ljb < k; ++ljb) - { /* for each block column ... 
*/ - jb = mycol + ljb * grid->npcol; /* not sure */ -#if 1 - // if (superGridMap[jb] != NOT_IN_GRID || !grid3d->zscp.Iam ) +#if ( PRNTlevel>=1 ) + int_t nLblocks = 0, nUblocks = 0; +#endif +#if ( PROFlevel>=1 ) + double t, t_u, t_l; + int_t u_blks; #endif - { - if (jb < nsupers) - { - pc = PCOL(jb, grid); - // if ( mycol == pc ) { /* Block column jb in my process column */ - - for (j = 0; j < grid->nprow; ++j) - ActiveFlag[j] = ActiveFlagAll[j + ljb * grid->nprow]; - for (j = 0; j < grid->nprow; ++j) - ActiveFlag[j + grid->nprow] = j; - for (j = 0; j < grid->nprow; ++j) - ranks[j] = -1; - - Root = -1; - Iactive = 0; - for (j = 0; j < grid->nprow; ++j) - { - if (ActiveFlag[j] != -3 * nsupers) - { - gb = ActiveFlag[j]; - pr = PROW(gb, grid); - if (gb == jb) - Root = pr; - if (myrow == pr) - Iactive = 1; - } - } - - quickSortM(ActiveFlag, 0, grid->nprow - 1, grid->nprow, 1, 2); - - if (Iactive == 1) - { - - assert(Root > -1); - rank_cnt = 1; - ranks[0] = Root; - for (j = 0; j < grid->nprow; ++j) - { - if (ActiveFlag[j] != -3 * nsupers && ActiveFlag[j + grid->nprow] != Root) - { - ranks[rank_cnt] = ActiveFlag[j + grid->nprow]; - ++rank_cnt; - } - } - - if (rank_cnt > 1) - { - for (ii = 0; ii < rank_cnt; ii++) // use global ranks rather than local ranks - ranks[ii] = PNUM(ranks[ii], pc, grid); - - msgsize = SuperSize(jb); - - C_BcTree_Create(&UBtree_ptr[ljb], grid->comm, ranks, rank_cnt, msgsize, 'd'); - UBtree_ptr[ljb].tag_ = BC_U; - - if (Root == myrow) - { - rank_cnt_ref = 1; - for (j = 0; j < grid->nprow; ++j) - { - if (bsendx_plist[ljb][j] != SLU_EMPTY) - { - ++rank_cnt_ref; - } - } + /* Initialization. */ + iam = grid->iam; + myrow = MYROW( iam, grid ); + mycol = MYCOL( iam, grid ); + for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0; + nsupers = supno[n-1] + 1; + Astore = (NRformat_loc *) A->Store; - assert(rank_cnt == rank_cnt_ref); - } - } - } - } - } - } - SUPERLU_FREE(ActiveFlag); - SUPERLU_FREE(ActiveFlagAll); - SUPERLU_FREE(ranks); - SUPERLU_FREE(SeedSTD_BC); - memTRS -= k * dword + grid->nprow * k * iword; // acount for SeedSTD_BC, ActiveFlagAll +//#if ( PRNTlevel>=1 ) + iword = sizeof(int_t); + dword = sizeof(double); +//#endif -#if (PROFlevel >= 1) - t = SuperLU_timer_() - t; - if (!iam) - printf(".. Construct Bcast tree for U: %.2f\t\n", t); +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter pddistribute_allgrid()"); #endif - -#if (PROFlevel >= 1) - t = SuperLU_timer_(); +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); #endif - /* construct the Reduce tree for U ... */ - /* the following is used as reference */ - nlb = CEILING(nsupers, grid->nprow); /* Number of local block rows */ - if (!(mod_bit = int32Malloc_dist(nlb))) - ABORT("Malloc fails for mod_bit[]."); - if (!(brecv = int32Malloc_dist(nlb))) - ABORT("Malloc fails for brecv[]."); - - for (k = 0; k < nlb; ++k) - mod_bit[k] = 0; - for (k = 0; k < nsupers; ++k) - { - pr = PROW(k, grid); - if (myrow == pr) - { - lib = LBi(k, grid); /* local block number */ - kcol = PCOL(k, grid); - if (mycol == kcol || bmod[lib]) - mod_bit[lib] = 1; /* contribution from off-diagonal and diagonal*/ - } - } - /* Every process receives the count, but it is only useful on the - diagonal processes. 
*/ - MPI_Allreduce(mod_bit, brecv, nlb, MPI_INT, MPI_SUM, grid->rscp.comm); - - k = CEILING(nsupers, grid->nprow); /* Number of local block rows */ - if (!(URtree_ptr = (C_Tree *)SUPERLU_MALLOC(k * sizeof(C_Tree)))) - ABORT("Malloc fails for URtree_ptr[]."); - if (!(ActiveFlag = intCalloc_dist(grid->npcol * 2))) - ABORT("Calloc fails for ActiveFlag[]."); - if (!(ranks = (int *)SUPERLU_MALLOC(grid->npcol * sizeof(int)))) - ABORT("Malloc fails for ranks[]."); - - if (!(SeedSTD_RD = (double *)SUPERLU_MALLOC(k * sizeof(double)))) - ABORT("Malloc fails for SeedSTD_RD[]."); - - for (i = 0; i < k; i++) - { - SeedSTD_RD[i] = rand(); - } - - MPI_Allreduce(MPI_IN_PLACE, &SeedSTD_RD[0], k, MPI_DOUBLE, MPI_MAX, grid->rscp.comm); - for (lib = 0; lib < k; ++lib) - { - C_RdTree_Nullify(&URtree_ptr[lib]); - } - if (!(ActiveFlagAll = intMalloc_dist(grid->npcol * k))) - ABORT("Calloc fails for ActiveFlagAll[]."); - for (j = 0; j < grid->npcol * k; ++j) - ActiveFlagAll[j] = 3 * nsupers; - memTRS += k * sizeof(C_Tree) + k * dword + grid->npcol * k * iword; // acount for URtree_ptr, SeedSTD_RD, ActiveFlagAll + dReDistribute_A(A, ScalePermstruct, Glu_freeable, xsup, supno, + grid, &xa, &asub, &a); - for (jb = 0; jb < nsupers; ++jb) - { /* for each block column ... */ -#if 1 - // if (superGridMap[jb] != NOT_IN_GRID || !grid3d->zscp.Iam ) +#if ( PROFlevel>=1 ) + t = SuperLU_timer_() - t; + if ( !iam ) printf("--------\n" + ".. Phase 1 - ReDistribute_A time: %.2f\t\n", t); #endif - { - fsupc = FstBlockC(jb); - pc = PCOL(jb, grid); - fsupc = FstBlockC(jb); - for (j = fsupc; j < FstBlockC(jb + 1); ++j) - { - istart = xusub[j]; - /* NOTE: Only the first nonzero index of the segment - is stored in usub[]. */ - for (i = istart; i < xusub[j + 1]; ++i) - { - irow = usub[i]; /* First nonzero in the segment. */ - ib = BlockNum(irow); - pr = PROW(ib, grid); - if (myrow == pr) - { /* Block row ib in my process row */ - lib = LBi(ib, grid); /* Local block number */ - ActiveFlagAll[pc + lib * grid->npcol] = SUPERLU_MIN(ActiveFlagAll[pc + lib * grid->npcol], jb); - } - } - } + if ( options->Fact == SamePattern_SameRowPerm ) { - pr = PROW(jb, grid); - if (myrow == pr) - { /* Block row ib in my process row */ - lib = LBi(jb, grid); /* Local block number */ - ActiveFlagAll[pc + lib * grid->npcol] = SUPERLU_MIN(ActiveFlagAll[pc + lib * grid->npcol], jb); - } - } - } - - for (lib = 0; lib < k; ++lib) - { - ib = myrow + lib * grid->nprow; /* not sure */ -#if 1 - // if (superGridMap[ib] != NOT_IN_GRID || !grid3d->zscp.Iam) +#if ( PROFlevel>=1 ) + t_l = t_u = 0; u_blks = 0; +#endif + /* We can propagate the new values of A into the existing + L and U data structures. */ + ilsum = Llu->ilsum; + ldaspa = Llu->ldalsum; + if ( !(dense = doubleCalloc_dist(ldaspa * sp_ienv_dist(3,options))) ) + ABORT("Calloc fails for SPA dense[]."); + nrbu = CEILING( nsupers, grid->nprow ); /* No. 
of local block rows */ + if ( !(Urb_length = intCalloc_dist(nrbu)) ) + ABORT("Calloc fails for Urb_length[]."); + if ( !(Urb_indptr = intMalloc_dist(nrbu)) ) + ABORT("Malloc fails for Urb_indptr[]."); + Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + Lindval_loc_bc_ptr = Llu->Lindval_loc_bc_ptr; + Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + Unzval_br_ptr = Llu->Unzval_br_ptr; + Unnz = Llu->Unnz; + + mem_use += 2.0*nrbu*iword + ldaspa*sp_ienv_dist(3,options)*dword; + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); #endif - { - if (ib < nsupers) - { - pr = PROW(ib, grid); - for (j = 0; j < grid->npcol; ++j) - ActiveFlag[j] = ActiveFlagAll[j + lib * grid->npcol]; - ; - for (j = 0; j < grid->npcol; ++j) - ActiveFlag[j + grid->npcol] = j; - for (j = 0; j < grid->npcol; ++j) - ranks[j] = -1; - Root = -1; - Iactive = 0; - - for (j = 0; j < grid->npcol; ++j) - { - if (ActiveFlag[j] != 3 * nsupers) - { - jb = ActiveFlag[j]; - pc = PCOL(jb, grid); - if (jb == ib) - Root = pc; - if (mycol == pc) - Iactive = 1; - } - } - quickSortM(ActiveFlag, 0, grid->npcol - 1, grid->npcol, 0, 2); + /* Initialize Uval to zero. */ + for (lb = 0; lb < nrbu; ++lb) { + Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ + index = Ufstnz_br_ptr[lb]; + if ( index ) { + uval = Unzval_br_ptr[lb]; + len = index[1]; + for (i = 0; i < len; ++i) uval[i] = zero; + } /* if index != NULL */ + } /* for lb ... */ + + for (jb = 0; jb < nsupers; ++jb) { /* Loop through each block column */ + pc = PCOL( jb, grid ); + if ( mycol == pc ) { /* Block column jb in my process column */ + fsupc = FstBlockC( jb ); + nsupc = SuperSize( jb ); + + /* Scatter A into SPA (for L), or into U directly. */ + for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) { + for (i = xa[j]; i < xa[j+1]; ++i) { + irow = asub[i]; + gb = BlockNum( irow ); + if ( myrow == PROW( gb, grid ) ) { + lb = LBi( gb, grid ); + if ( gb < jb ) { /* in U */ + index = Ufstnz_br_ptr[lb]; + uval = Unzval_br_ptr[lb]; + if(index){ + while ( (k = index[Urb_indptr[lb]]) < jb ) { + /* Skip nonzero values in this block */ + Urb_length[lb] += index[Urb_indptr[lb]+1]; + /* Move pointer to the next block */ + Urb_indptr[lb] += UB_DESCRIPTOR + + SuperSize( k ); + } + /*assert(k == jb);*/ + /* start fstnz */ + istart = Urb_indptr[lb] + UB_DESCRIPTOR; + len = Urb_length[lb]; + fsupc1 = FstBlockC( gb+1 ); + k = j - fsupc; + /* Sum the lengths of the leading columns */ + for (jj = 0; jj < k; ++jj) + len += fsupc1 - index[istart++]; + /*assert(irow>=index[istart]);*/ + uval[len + irow - index[istart]] = a[i]; + } + } else { /* in L; put in SPA first */ + irow = ilsum[lb] + irow - FstBlockC( gb ); + dense_col[irow] = a[i]; + } + } + } /* for i ... */ + dense_col += ldaspa; + } /* for j ... */ + +#if ( PROFlevel>=1 ) + t_u += SuperLU_timer_() - t; + t = SuperLU_timer_(); +#endif - if (Iactive == 1) - { - assert(Root > -1); - rank_cnt = 1; - ranks[0] = Root; - for (j = 0; j < grid->npcol; ++j) - { - if (ActiveFlag[j] != 3 * nsupers && ActiveFlag[j + grid->npcol] != Root) - { - ranks[rank_cnt] = ActiveFlag[j + grid->npcol]; - ++rank_cnt; - } - } - if (rank_cnt > 1) - { + /* Gather the values of A from SPA into Lnzval[]. */ + ljb = LBj( jb, grid ); /* Local block number */ + index = Lrowind_bc_ptr[ljb]; + if ( index ) { + nrbl = index[0]; /* Number of row blocks. */ + len = index[1]; /* LDA of lusup[]. 
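                                  Columns of the supernode are stored with
                                  leading dimension len, so entry (i,j) of
                                  this block column is lusup[i + j*len].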
*/ + lusup = Lnzval_bc_ptr[ljb]; + next_lind = BC_HEADER; + next_lval = 0; + for (jj = 0; jj < nrbl; ++jj) { + gb = index[next_lind++]; + len1 = index[next_lind++]; /* Rows in the block. */ + lb = LBi( gb, grid ); + for (bnnz = 0; bnnz < len1; ++bnnz) { + irow = index[next_lind++]; /* Global index. */ + irow = ilsum[lb] + irow - FstBlockC( gb ); + k = next_lval++; + for (j = 0, dense_col = dense; j < nsupc; ++j) { + lusup[k] = dense_col[irow]; + dense_col[irow] = zero; + k += len; + dense_col += ldaspa; + } + } /* for bnnz ... */ + } /* for jj ... */ + } /* if index ... */ +#if ( PROFlevel>=1 ) + t_l += SuperLU_timer_() - t; +#endif + } /* if mycol == pc */ + } /* for jb ... */ + + SUPERLU_FREE(dense); + SUPERLU_FREE(Urb_length); + SUPERLU_FREE(Urb_indptr); + mem_use -= 2.0*nrbu*iword + ldaspa*sp_ienv_dist(3,options)*dword; + +#if ( PROFlevel>=1 ) + if ( !iam ) printf(".. 2nd distribute time: L %.2f\tU %.2f\tu_blks %d\tnrbu %d\n", + t_l, t_u, u_blks, nrbu); +#endif - for (ii = 0; ii < rank_cnt; ii++) // use global ranks rather than local ranks - ranks[ii] = PNUM(pr, ranks[ii], grid); + } else { /* fact is not SamePattern_SameRowPerm */ + /* ------------------------------------------------------------ + FIRST TIME CREATING THE L AND U DATA STRUCTURES. + ------------------------------------------------------------*/ - msgsize = SuperSize(ib); +#if ( PROFlevel>=1 ) + t_l = t_u = 0; u_blks = 0; +#endif + /* We first need to set up the L and U data structures and then + * propagate the values of A into them. + */ + lsub = Glu_freeable->lsub; /* compressed L subscripts */ + xlsub = Glu_freeable->xlsub; + usub = Glu_freeable->usub; /* compressed U subscripts */ + xusub = Glu_freeable->xusub; + + if ( !(ToRecv = (int *) SUPERLU_MALLOC(nsupers * sizeof(int))) ) + ABORT("Malloc fails for ToRecv[]."); + for (i = 0; i < nsupers; ++i) ToRecv[i] = 0; + + k = CEILING( nsupers, grid->npcol );/* Number of local column blocks */ + if ( !(ToSendR = (int **) SUPERLU_MALLOC(k*sizeof(int*))) ) + ABORT("Malloc fails for ToSendR[]."); + j = k * grid->npcol; + if ( !(index1 = SUPERLU_MALLOC(j * sizeof(int))) ) + ABORT("Malloc fails for index[]."); + + mem_use += (float) k*sizeof(int_t*) + (j + nsupers)*iword; + + for (i = 0; i < j; ++i) index1[i] = SLU_EMPTY; + for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index1[j]; + k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ + + /* Pointers to the beginning of each block row of U. */ + if ( !(Unzval_br_ptr = + (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) + ABORT("Malloc fails for Unzval_br_ptr[]."); + if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) + ABORT("Malloc fails for Ufstnz_br_ptr[]."); + + + if ( !(ToSendD = SUPERLU_MALLOC(k * sizeof(int))) ) + ABORT("Malloc fails for ToSendD[]."); + for (i = 0; i < k; ++i) ToSendD[i] = NO; + if ( !(ilsum = intMalloc_dist(k+1)) ) + ABORT("Malloc fails for ilsum[]."); + + /* Auxiliary arrays used to set up U block data structures. + They are freed on return. */ + if ( !(rb_marker = intCalloc_dist(k)) ) + ABORT("Calloc fails for rb_marker[]."); + if ( !(Urb_length = intCalloc_dist(k)) ) + ABORT("Calloc fails for Urb_length[]."); + if ( !(Urb_indptr = intMalloc_dist(k)) ) + ABORT("Malloc fails for Urb_indptr[]."); + if ( !(Urb_fstnz = intCalloc_dist(k)) ) + ABORT("Calloc fails for Urb_fstnz[]."); + if ( !(Ucbs = intCalloc_dist(k)) ) + ABORT("Calloc fails for Ucbs[]."); + + mem_use += 2.0*k*sizeof(int_t*) + (7*k+1)*iword; + + /* Compute ldaspa and ilsum[]. 
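       ldaspa sums the sizes of the supernodes whose block row lives in my
       process row; ilsum[lb] is the running prefix, i.e. the first SPA row
       of local block row lb. Hypothetical example: nprow = 2, supernode
       sizes {3, 2, 4, 1}; process row 0 owns blocks 0 and 2, giving
       ilsum = {0, 3, 7} and ldaspa = 7.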
*/ + ldaspa = 0; + ilsum[0] = 0; + for (gb = 0; gb < nsupers; ++gb) { + if ( myrow == PROW( gb, grid ) ) { + i = SuperSize( gb ); + ldaspa += i; + lb = LBi( gb, grid ); + ilsum[lb + 1] = ilsum[lb] + i; + } + } + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + /* ------------------------------------------------------------ + COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U. + THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U). + ------------------------------------------------------------*/ + + /* Loop through each supernode column. */ + for (jb = 0; jb < nsupers; ++jb) { + pc = PCOL( jb, grid ); + fsupc = FstBlockC( jb ); + nsupc = SuperSize( jb ); + /* Loop through each column in the block. */ + for (j = fsupc; j < fsupc + nsupc; ++j) { + /* usub[*] contains only "first nonzero" in each segment. */ + for (i = xusub[j]; i < xusub[j+1]; ++i) { + irow = usub[i]; /* First nonzero of the segment. */ + gb = BlockNum( irow ); + kcol = PCOL( gb, grid ); + ljb = LBj( gb, grid ); + if ( mycol == kcol && mycol != pc ) ToSendR[ljb][pc] = YES; + pr = PROW( gb, grid ); + lb = LBi( gb, grid ); + if ( mycol == pc ) { + if ( myrow == pr ) { + ToSendD[lb] = YES; + /* Count nonzeros in entire block row. */ + Urb_length[lb] += FstBlockC( gb+1 ) - irow; + if (rb_marker[lb] <= jb) {/* First see the block */ + rb_marker[lb] = jb + 1; + Urb_fstnz[lb] += nsupc; + ++Ucbs[lb]; /* Number of column blocks + in block row lb. */ +#if ( PRNTlevel>=1 ) + ++nUblocks; +#endif + } + ToRecv[gb] = 1; + } else ToRecv[gb] = 2; /* Do I need 0, 1, 2 ? */ + } + } /* for i ... */ + } /* for j ... */ + } /* for jb ... */ + + /* Set up the initial pointers for each block row in U. */ + nrbu = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + for (lb = 0; lb < nrbu; ++lb) { + ib = myrow+lb*grid->nprow; /* not sure */ + len = Urb_length[lb]; + rb_marker[lb] = 0; /* Reset block marker. */ + if ( len ) { + /* Add room for descriptors */ + len1 = Urb_fstnz[lb] + BR_HEADER + Ucbs[lb] * UB_DESCRIPTOR; + mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 ); + mybufmax[3] = SUPERLU_MAX( mybufmax[3], len ); + + if(superGridMap[ib]!= NOT_IN_GRID){ // YL: added supernode mask here + if ( !(index = intMalloc_dist(len1+1)) ) + ABORT("Malloc fails for Uindex[]."); + Ufstnz_br_ptr[lb] = index; + if ( !(Unzval_br_ptr[lb] = doubleMalloc_dist(len)) ) + ABORT("Malloc fails for Unzval_br_ptr[*][]."); + + mem_use += len*dword + (len1+1)*iword; + + index[0] = Ucbs[lb]; /* Number of column blocks */ + index[1] = len; /* Total length of nzval[] */ + index[2] = len1; /* Total length of index[] */ + index[len1] = -1; /* End marker */ + }else{ + Ufstnz_br_ptr[lb] = NULL; + Unzval_br_ptr[lb] = NULL; + } + } else { + Ufstnz_br_ptr[lb] = NULL; + Unzval_br_ptr[lb] = NULL; + } + Urb_length[lb] = 0; /* Reset block length. */ + Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ + Urb_fstnz[lb] = BR_HEADER; + } /* for lb ... */ + + SUPERLU_FREE(Ucbs); + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. Phase 2 - setup U strut time: %.2f\t\n", t); +#endif - C_RdTree_Create(&URtree_ptr[lib], grid->comm, ranks, rank_cnt, msgsize, 'd'); - URtree_ptr[lib].tag_ = RD_U; - if (Root == mycol) - { - assert(rank_cnt == brecv[lib]); - } - } + mem_use -= 2.0*k * iword; + + /* Auxiliary arrays used to set up L block data structures. + They are freed on return. + k is the number of local row blocks. 
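       (k = CEILING(nsupers, grid->nprow), set above; the Lrb_* arrays are
       indexed by local block row)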
*/ + if ( !(Lrb_length = intCalloc_dist(k)) ) + ABORT("Calloc fails for Lrb_length[]."); + if ( !(Lrb_number = intMalloc_dist(k)) ) + ABORT("Malloc fails for Lrb_number[]."); + if ( !(Lrb_indptr = intMalloc_dist(k)) ) + ABORT("Malloc fails for Lrb_indptr[]."); + if ( !(Lrb_valptr = intMalloc_dist(k)) ) + ABORT("Malloc fails for Lrb_valptr[]."); + if ( !(dense = doubleCalloc_dist(ldaspa * sp_ienv_dist(3,options))) ) + ABORT("Calloc fails for SPA dense[]."); + + /* These counts will be used for triangular solves. */ + if ( !(fmod = int32Calloc_dist(k)) ) + ABORT("Calloc fails for fmod[]."); + if ( !(bmod = int32Calloc_dist(k)) ) + ABORT("Calloc fails for bmod[]."); + + /* ------------------------------------------------ */ + mem_use += 6.0*k*iword + ldaspa*sp_ienv_dist(3,options)*dword; + + k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ + + /* Pointers to the beginning of each block column of L. */ + if ( !(Lnzval_bc_ptr = + (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) + ABORT("Malloc fails for Lnzval_bc_ptr[]."); + Lnzval_bc_ptr[k-1] = NULL; + if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) + ABORT("Malloc fails for Lrowind_bc_ptr[]."); + Lrowind_bc_ptr[k-1] = NULL; + + if ( !(Lindval_loc_bc_ptr = + (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) + ABORT("Malloc fails for Lindval_loc_bc_ptr[]."); + Lindval_loc_bc_ptr[k-1] = NULL; + + if ( !(Linv_bc_ptr = + (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) { + fprintf(stderr, "Malloc fails for Linv_bc_ptr[]."); + } + if ( !(Uinv_bc_ptr = + (double**)SUPERLU_MALLOC(k * sizeof(double*))) ) { + fprintf(stderr, "Malloc fails for Uinv_bc_ptr[]."); + } + Linv_bc_ptr[k-1] = NULL; + Uinv_bc_ptr[k-1] = NULL; + + if ( !(Unnz = + (int_t*)SUPERLU_MALLOC(k * sizeof(int_t))) ) + ABORT("Malloc fails for Unnz[]."); + + + /* These lists of processes will be used for triangular solves. */ + if ( !(fsendx_plist = (int **) SUPERLU_MALLOC(k*sizeof(int*))) ) + ABORT("Malloc fails for fsendx_plist[]."); + len = k * grid->nprow; + if ( !(index1 = int32Malloc_dist(len)) ) + ABORT("Malloc fails for fsendx_plist[0]"); + for (i = 0; i < len; ++i) index1[i] = SLU_EMPTY; + for (i = 0, j = 0; i < k; ++i, j += grid->nprow) + fsendx_plist[i] = &index1[j]; + if ( !(bsendx_plist = (int **) SUPERLU_MALLOC(k*sizeof(int*))) ) + ABORT("Malloc fails for bsendx_plist[]."); + if ( !(index1 = int32Malloc_dist(len)) ) + ABORT("Malloc fails for bsendx_plist[0]"); + for (i = 0; i < len; ++i) index1[i] = SLU_EMPTY; + for (i = 0, j = 0; i < k; ++i, j += grid->nprow) + bsendx_plist[i] = &index1[j]; + /* -------------------------------------------------------------- */ + mem_use += 4.0*k*sizeof(int_t*) + 2.0*len*iword; + memTRS += k*sizeof(int_t*) + 2.0*k*sizeof(double*) + k*iword; //acount for Lindval_loc_bc_ptr, Unnz, Linv_bc_ptr,Uinv_bc_ptr + + /*------------------------------------------------------------ + PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS. + THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U. + ------------------------------------------------------------*/ + long int Linv_bc_cnt=0; + long int Uinv_bc_cnt=0; + long int Lrowind_bc_cnt=0; + long int Lnzval_bc_cnt=0; + long int Lindval_loc_bc_cnt=0; + for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */ + pc = PCOL( jb, grid ); + if ( mycol == pc ) { /* Block column jb in my process column */ + fsupc = FstBlockC( jb ); + nsupc = SuperSize( jb ); + ljb = LBj( jb, grid ); /* Local block number */ + + /* Scatter A into SPA. 
*/ + for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) { + for (i = xa[j]; i < xa[j+1]; ++i) { + irow = asub[i]; + gb = BlockNum( irow ); + if ( myrow == PROW( gb, grid ) ) { + lb = LBi( gb, grid ); + irow = ilsum[lb] + irow - FstBlockC( gb ); + dense_col[irow] = a[i]; + } + } + dense_col += ldaspa; + } /* for j ... */ + + jbrow = PROW( jb, grid ); + + /*------------------------------------------------ + * SET UP U BLOCKS. + *------------------------------------------------*/ +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + kseen = 0; + dense_col = dense; + /* Loop through each column in the block column. */ + for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { + istart = xusub[j]; + /* NOTE: Only the first nonzero index of the segment + is stored in usub[]. */ + for (i = istart; i < xusub[j+1]; ++i) { + irow = usub[i]; /* First nonzero in the segment. */ + gb = BlockNum( irow ); + pr = PROW( gb, grid ); + if ( pr != jbrow && + myrow == jbrow && /* diag. proc. owning jb */ + bsendx_plist[ljb][pr] == SLU_EMPTY ) { + bsendx_plist[ljb][pr] = YES; + ++nbsendx; } + if ( myrow == pr) { // YL: added supernode mask here, TODO: double check bmod + if(superGridMap[gb]!= NOT_IN_GRID){ + lb = LBi( gb, grid ); /* Local block number */ + index = Ufstnz_br_ptr[lb]; + uval = Unzval_br_ptr[lb]; + fsupc1 = FstBlockC( gb+1 ); + if (rb_marker[lb] <= jb) { /* First time see + the block */ + rb_marker[lb] = jb + 1; + Urb_indptr[lb] = Urb_fstnz[lb];; + index[Urb_indptr[lb]] = jb; /* Descriptor */ + Urb_indptr[lb] += UB_DESCRIPTOR; + /* Record the first location in index[] of the + next block */ + Urb_fstnz[lb] = Urb_indptr[lb] + nsupc; + len = Urb_indptr[lb];/* Start fstnz in index */ + index[len-1] = 0; + for (k = 0; k < nsupc; ++k) + index[len+k] = fsupc1; + if ( gb != jb )/* Exclude diagonal block. */ + ++bmod[lb];/* Mod. count for back solve */ + if ( kseen == 0 && myrow != jbrow ) { + ++nbrecvx; + kseen = 1; + } + } else { /* Already saw the block */ + len = Urb_indptr[lb];/* Start fstnz in index */ + } + jj = j - fsupc; + index[len+jj] = irow; + /* Load the numerical values */ + k = fsupc1 - irow; /* No. of nonzeros in segment */ + index[len-1] += k; /* Increment block length in + Descriptor */ + irow = ilsum[lb] + irow - FstBlockC( gb ); + for (ii = 0; ii < k; ++ii) { + uval[Urb_length[lb]++] = dense_col[irow + ii]; + dense_col[irow + ii] = zero; + } + }else{ + lb = LBi( gb, grid ); /* Local block number */ + uval = Unzval_br_ptr[lb]; + fsupc1 = FstBlockC( gb+1 ); + if (rb_marker[lb] <= jb) { /* First time see + the block */ + rb_marker[lb] = jb + 1; + Urb_indptr[lb] = Urb_fstnz[lb];; + Urb_indptr[lb] += UB_DESCRIPTOR; + /* Record the first location in index[] of the + next block */ + Urb_fstnz[lb] = Urb_indptr[lb] + nsupc; + + if ( gb != jb )/* Exclude diagonal block. */ + ++bmod[lb];/* Mod. count for back solve */ + if ( kseen == 0 && myrow != jbrow ) { + ++nbrecvx; + kseen = 1; + } + } + } + + } /* if myrow == pr ... */ + } /* for i ... */ + dense_col += ldaspa; + } /* for j ... */ + +#if ( PROFlevel>=1 ) + t_u += SuperLU_timer_() - t; + t = SuperLU_timer_(); +#endif + /*------------------------------------------------ + * SET UP L BLOCKS. + *------------------------------------------------*/ + + /* Count number of blocks and length of each block. */ + nrbl = 0; + len = 0; /* Number of row subscripts I own. 
*/ + kseen = 0; + istart = xlsub[fsupc]; + for (i = istart; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + gb = BlockNum( irow ); /* Global block number */ + pr = PROW( gb, grid ); /* Process row owning this block */ + if ( pr != jbrow && + myrow == jbrow && /* diag. proc. owning jb */ + fsendx_plist[ljb][pr] == SLU_EMPTY /* first time */ ) { + fsendx_plist[ljb][pr] = YES; + ++nfsendx; } - } - } - SUPERLU_FREE(mod_bit); - SUPERLU_FREE(brecv); - SUPERLU_FREE(ActiveFlag); - SUPERLU_FREE(ActiveFlagAll); - SUPERLU_FREE(ranks); - SUPERLU_FREE(SeedSTD_RD); - - memTRS -= k * dword + grid->nprow * k * iword; // acount for SeedSTD_RD, ActiveFlagAll - -#if (PROFlevel >= 1) - t = SuperLU_timer_() - t; - if (!iam) - printf(".. Construct Reduce tree for U: %.2f\t\n", t); + if ( myrow == pr ) { + lb = LBi( gb, grid ); /* Local block number */ + if (rb_marker[lb] <= jb) { /* First see this block */ + rb_marker[lb] = jb + 1; + Lrb_length[lb] = 1; + Lrb_number[nrbl++] = gb; + if ( gb != jb ) /* Exclude diagonal block. */ + ++fmod[lb]; /* Mod. count for forward solve */ + if ( kseen == 0 && myrow != jbrow ) { + ++nfrecvx; + kseen = 1; + } +#if ( PRNTlevel>=1 ) + ++nLblocks; #endif - } - //////////////////////////////////////////////////////// - - Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; - Llu->Lrowind_bc_dat = Lrowind_bc_dat; - Llu->Lrowind_bc_offset = Lrowind_bc_offset; - Llu->Lrowind_bc_cnt = Lrowind_bc_cnt; - - Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr; - Llu->Lindval_loc_bc_dat = Lindval_loc_bc_dat; - Llu->Lindval_loc_bc_offset = Lindval_loc_bc_offset; - Llu->Lindval_loc_bc_cnt = Lindval_loc_bc_cnt; - - Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; - Llu->Lnzval_bc_dat = Lnzval_bc_dat; - Llu->Lnzval_bc_offset = Lnzval_bc_offset; - Llu->Lnzval_bc_cnt = Lnzval_bc_cnt; - - Llu->Ufstnz_br_ptr = Ufstnz_br_ptr; - Llu->Ufstnz_br_dat = Ufstnz_br_dat; - Llu->Ufstnz_br_offset = Ufstnz_br_offset; - Llu->Ufstnz_br_cnt = Ufstnz_br_cnt; - - Llu->Unzval_br_ptr = Unzval_br_ptr; - Llu->Unzval_br_dat = Unzval_br_dat; - Llu->Unzval_br_offset = Unzval_br_offset; - Llu->Unzval_br_cnt = Unzval_br_cnt; - - Llu->Unnz = Unnz; - Llu->ToRecv = ToRecv; - Llu->ToSendD = ToSendD; - Llu->ToSendR = ToSendR; - Llu->fmod = fmod; - Llu->fsendx_plist = fsendx_plist; - Llu->nfrecvx = nfrecvx; - Llu->nfsendx = nfsendx; - Llu->bmod = bmod; - Llu->bsendx_plist = bsendx_plist; - Llu->nbrecvx = nbrecvx; - Llu->nbsendx = nbsendx; - Llu->ilsum = ilsum; - Llu->ldalsum = ldaspa; - - Llu->LRtree_ptr = LRtree_ptr; - Llu->LBtree_ptr = LBtree_ptr; - Llu->URtree_ptr = URtree_ptr; - Llu->UBtree_ptr = UBtree_ptr; - - Llu->Linv_bc_ptr = Linv_bc_ptr; - Llu->Linv_bc_dat = Linv_bc_dat; - Llu->Linv_bc_offset = Linv_bc_offset; - Llu->Linv_bc_cnt = Linv_bc_cnt; - - Llu->Uinv_bc_ptr = Uinv_bc_ptr; - Llu->Uinv_bc_dat = Uinv_bc_dat; - Llu->Uinv_bc_offset = Uinv_bc_offset; - Llu->Uinv_bc_cnt = Uinv_bc_cnt; - - Llu->Urbs = Urbs; - Llu->Ucb_indptr = Ucb_indptr; - Llu->Ucb_inddat = Ucb_inddat; - Llu->Ucb_indoffset = Ucb_indoffset; - Llu->Ucb_indcnt = Ucb_indcnt; - Llu->Ucb_valptr = Ucb_valptr; - Llu->Ucb_valdat = Ucb_valdat; - Llu->Ucb_valoffset = Ucb_valoffset; - Llu->Ucb_valcnt = Ucb_valcnt; - -#ifdef GPU_ACC - if (!grid3d->zscp.Iam && 0) - { - checkGPU(gpuMalloc((void **)&Llu->d_xsup, (n + 1) * sizeof(int_t))); - checkGPU(gpuMemcpy(Llu->d_xsup, xsup, (n + 1) * sizeof(int_t), gpuMemcpyHostToDevice)); - checkGPU(gpuMalloc((void **)&Llu->d_LRtree_ptr, CEILING(nsupers, grid->nprow) * sizeof(C_Tree))); - checkGPU(gpuMalloc((void **)&Llu->d_LBtree_ptr, CEILING(nsupers, grid->npcol) 
* sizeof(C_Tree))); - checkGPU(gpuMalloc((void **)&Llu->d_URtree_ptr, CEILING(nsupers, grid->nprow) * sizeof(C_Tree))); - checkGPU(gpuMalloc((void **)&Llu->d_UBtree_ptr, CEILING(nsupers, grid->npcol) * sizeof(C_Tree))); - checkGPU(gpuMemcpy(Llu->d_LRtree_ptr, Llu->LRtree_ptr, CEILING(nsupers, grid->nprow) * sizeof(C_Tree), gpuMemcpyHostToDevice)); - checkGPU(gpuMemcpy(Llu->d_LBtree_ptr, Llu->LBtree_ptr, CEILING(nsupers, grid->npcol) * sizeof(C_Tree), gpuMemcpyHostToDevice)); - checkGPU(gpuMemcpy(Llu->d_URtree_ptr, Llu->URtree_ptr, CEILING(nsupers, grid->nprow) * sizeof(C_Tree), gpuMemcpyHostToDevice)); - checkGPU(gpuMemcpy(Llu->d_UBtree_ptr, Llu->UBtree_ptr, CEILING(nsupers, grid->npcol) * sizeof(C_Tree), gpuMemcpyHostToDevice)); - checkGPU(gpuMalloc((void **)&Llu->d_Lrowind_bc_dat, (Llu->Lrowind_bc_cnt) * sizeof(int_t))); - checkGPU(gpuMemcpy(Llu->d_Lrowind_bc_dat, Llu->Lrowind_bc_dat, (Llu->Lrowind_bc_cnt) * sizeof(int_t), gpuMemcpyHostToDevice)); - checkGPU(gpuMalloc((void **)&Llu->d_Lindval_loc_bc_dat, (Llu->Lindval_loc_bc_cnt) * sizeof(int_t))); - checkGPU(gpuMemcpy(Llu->d_Lindval_loc_bc_dat, Llu->Lindval_loc_bc_dat, (Llu->Lindval_loc_bc_cnt) * sizeof(int_t), gpuMemcpyHostToDevice)); - checkGPU(gpuMalloc((void **)&Llu->d_Lrowind_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int))); - checkGPU(gpuMemcpy(Llu->d_Lrowind_bc_offset, Llu->Lrowind_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice)); - checkGPU(gpuMalloc((void **)&Llu->d_Lindval_loc_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int))); - checkGPU(gpuMemcpy(Llu->d_Lindval_loc_bc_offset, Llu->Lindval_loc_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice)); - checkGPU(gpuMalloc((void **)&Llu->d_Lnzval_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int))); - checkGPU(gpuMemcpy(Llu->d_Lnzval_bc_offset, Llu->Lnzval_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice)); - - // some dummy allocation to avoid checking whether they are null pointers later - checkGPU(gpuMalloc((void **)&Llu->d_Ucolind_bc_dat, sizeof(int_t))); - checkGPU(gpuMalloc((void **)&Llu->d_Ucolind_bc_offset, sizeof(int64_t))); - checkGPU(gpuMalloc((void **)&Llu->d_Unzval_bc_dat, sizeof(double))); - checkGPU(gpuMalloc((void **)&Llu->d_Unzval_bc_offset, sizeof(int64_t))); - checkGPU(gpuMalloc((void **)&Llu->d_Uindval_loc_bc_dat, sizeof(int_t))); - checkGPU(gpuMalloc((void **)&Llu->d_Uindval_loc_bc_offset, sizeof(int_t))); - - checkGPU(gpuMalloc((void **)&Llu->d_Linv_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int))); - checkGPU(gpuMemcpy(Llu->d_Linv_bc_offset, Llu->Linv_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice)); - checkGPU(gpuMalloc((void **)&Llu->d_Uinv_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int))); - checkGPU(gpuMemcpy(Llu->d_Uinv_bc_offset, Llu->Uinv_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice)); - checkGPU(gpuMalloc((void **)&Llu->d_ilsum, (CEILING(nsupers, grid->nprow) + 1) * sizeof(int_t))); - checkGPU(gpuMemcpy(Llu->d_ilsum, Llu->ilsum, (CEILING(nsupers, grid->nprow) + 1) * sizeof(int_t), gpuMemcpyHostToDevice)); - - /* gpuMemcpy for the following is performed in pxgssvx */ - checkGPU(gpuMalloc((void **)&Llu->d_Lnzval_bc_dat, (Llu->Lnzval_bc_cnt) * sizeof(double))); - checkGPU(gpuMalloc((void **)&Llu->d_Linv_bc_dat, (Llu->Linv_bc_cnt) * sizeof(double))); - checkGPU(gpuMalloc((void **)&Llu->d_Uinv_bc_dat, (Llu->Uinv_bc_cnt) * 
sizeof(double))); - } + } else { + ++Lrb_length[lb]; + } + ++len; + } + } /* for i ... */ + + + if ( nrbl) { /* Do not ensure the blocks are sorted! */ + if(superGridMap[jb]!= NOT_IN_GRID){ // YL: added supernode mask here + /* Set up the initial pointers for each block in + index[] and nzval[]. */ + /* Add room for descriptors */ + len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; + if ( !(index = intMalloc_dist(len1)) ) + ABORT("Malloc fails for index[]"); + if (!(lusup = (double*)SUPERLU_MALLOC(len*nsupc * sizeof(double)))) + ABORT("Malloc fails for lusup[]"); + if ( !(Lindval_loc_bc_ptr[ljb] = intCalloc_dist(nrbl*3)) ) + ABORT("Malloc fails for Lindval_loc_bc_ptr[ljb][]"); + myrow = MYROW( iam, grid ); + krow = PROW( jb, grid ); + if(myrow==krow){ /* diagonal block */ + if (!(Linv_bc_ptr[ljb] = (double*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(double)))) + ABORT("Malloc fails for Linv_bc_ptr[ljb][]"); + if (!(Uinv_bc_ptr[ljb] = (double*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(double)))) + ABORT("Malloc fails for Uinv_bc_ptr[ljb][]"); + }else{ + Linv_bc_ptr[ljb] = NULL; + Uinv_bc_ptr[ljb] = NULL; + } + + mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 ); + mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc ); + mybufmax[4] = SUPERLU_MAX( mybufmax[4], len ); + mem_use += len*nsupc*dword + (len1)*iword; + memTRS += nrbl*3.0*iword + 2.0*nsupc*nsupc*dword; //acount for Lindval_loc_bc_ptr[ljb],Linv_bc_ptr[ljb],Uinv_bc_ptr[ljb] + index[0] = nrbl; /* Number of row blocks */ + index[1] = len; /* LDA of the nzval[] */ + next_lind = BC_HEADER; + next_lval = 0; + for (k = 0; k < nrbl; ++k) { + gb = Lrb_number[k]; + lb = LBi( gb, grid ); + len = Lrb_length[lb]; + Lindval_loc_bc_ptr[ljb][k] = lb; + Lindval_loc_bc_ptr[ljb][k+nrbl] = next_lind; + Lindval_loc_bc_ptr[ljb][k+nrbl*2] = next_lval; + Lrb_length[lb] = 0; /* Reset vector of block length */ + index[next_lind++] = gb; /* Descriptor */ + index[next_lind++] = len; + Lrb_indptr[lb] = next_lind; + Lrb_valptr[lb] = next_lval; + next_lind += len; + next_lval += len; + } + /* Propagate the compressed row subscripts to Lindex[], + and the initial values of A from SPA into Lnzval[]. */ + len = index[1]; /* LDA of lusup[] */ + for (i = istart; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + gb = BlockNum( irow ); + if ( myrow == PROW( gb, grid ) ) { + lb = LBi( gb, grid ); + k = Lrb_indptr[lb]++; /* Random access a block */ + index[k] = irow; + k = Lrb_valptr[lb]++; + irow = ilsum[lb] + irow - FstBlockC( gb ); + for (j = 0, dense_col = dense; j < nsupc; ++j) { + lusup[k] = dense_col[irow]; + dense_col[irow] = 0.0; + k += len; + dense_col += ldaspa; + } + } + } /* for i ... */ + + Lrowind_bc_ptr[ljb] = index; + Lnzval_bc_ptr[ljb] = lusup; + + /* sort Lindval_loc_bc_ptr[ljb], Lrowind_bc_ptr[ljb] + and Lnzval_bc_ptr[ljb] here. */ + if(nrbl>1){ + krow = PROW( jb, grid ); + if(myrow==krow){ /* skip the diagonal block */ + uu=nrbl-2; + lloc = &Lindval_loc_bc_ptr[ljb][1]; + }else{ + uu=nrbl-1; + lloc = Lindval_loc_bc_ptr[ljb]; + } + quickSortM(lloc,0,uu,nrbl,0,3); + } + + + if ( !(index_srt = intMalloc_dist(len1)) ) + ABORT("Malloc fails for index_srt[]"); + if (!(lusup_srt = (double*)SUPERLU_MALLOC(len*nsupc * sizeof(double)))) + ABORT("Malloc fails for lusup_srt[]"); + + idx_indx = BC_HEADER; + idx_lusup = 0; + for (jj=0;jj=1 ) + t_l += SuperLU_timer_() - t; #endif - -#if (PRNTlevel >= 1) - if (!iam) - printf(".. # L blocks " IFMT "\t# U blocks " IFMT "\n", - nLblocks, nUblocks); + } /* if mycol == pc */ + + } /* for jb ... 
*/ + + ///////////////////////////////////////////////////////////////// + + /* Set up additional pointers for the index and value arrays of U. + nub is the number of local block columns. */ + nub = CEILING( nsupers, grid->npcol); /* Number of local block columns. */ + if ( !(Urbs = (int_t *) intCalloc_dist(2*nub)) ) + ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero + blocks in a block column. */ + Urbs1 = Urbs + nub; + if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) ) + ABORT("Malloc fails for Ucb_indptr[]"); + if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) ) + ABORT("Malloc fails for Ucb_valptr[]"); + + mem_use += nub * sizeof(Ucb_indptr_t *) + nub * sizeof(int_t *) + (2*nub)*iword; + + + nlb = CEILING( nsupers, grid->nprow ); /* Number of local block rows. */ + + /* Count number of row blocks in a block column. + One pass of the skeleton graph of U. */ + for (lk = 0; lk < nlb; ++lk) { + usub1 = Ufstnz_br_ptr[lk]; + // YL: no need to supernode mask here ???? + if ( usub1 ) { /* Not an empty block row. */ + /* usub1[0] -- number of column blocks in this block row. */ + i = BR_HEADER; /* Pointer in index array. */ + for (lb = 0; lb < usub1[0]; ++lb) { /* For all column blocks. */ + k = usub1[i]; /* Global block number */ + ++Urbs[LBj(k,grid)]; + i += UB_DESCRIPTOR + SuperSize( k ); + } + } + } + + /* Set up the vertical linked lists for the row blocks. + One pass of the skeleton graph of U. */ + for (lb = 0; lb < nub; ++lb) { + // YL: no need to add supernode mask here ???? + if ( Urbs[lb] ) { /* Not an empty block column. */ + if ( !(Ucb_indptr[lb] + = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) ) + ABORT("Malloc fails for Ucb_indptr[lb][]"); + if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) ) + ABORT("Malloc fails for Ucb_valptr[lb][]"); + mem_use += Urbs[lb] * sizeof(Ucb_indptr_t) + (Urbs[lb])*iword; + }else{ + Ucb_valptr[lb]=NULL; + Ucb_indptr[lb]=NULL; + } + } + for (lk = 0; lk < nlb; ++lk) { /* For each block row. */ + usub1 = Ufstnz_br_ptr[lk]; + // printf("ID %5d lk %5d usub1 %10d\n",superGridMap[0],lk, usub1); + // YL: no need to add supernode mask here ???? + if ( usub1 ) { /* Not an empty block row. */ + i = BR_HEADER; /* Pointer in index array. */ + j = 0; /* Pointer in nzval array. */ + + for (lb = 0; lb < usub1[0]; ++lb) { /* For all column blocks. */ + k = usub1[i]; /* Global block number, column-wise. */ + ljb = LBj( k, grid ); /* Local block number, column-wise. */ + Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk; + + Ucb_indptr[ljb][Urbs1[ljb]].indpos = i; + Ucb_valptr[ljb][Urbs1[ljb]] = j; + + ++Urbs1[ljb]; + j += usub1[i+1]; + i += UB_DESCRIPTOR + SuperSize( k ); + } + } + } + + +/* Count the nnzs per block column */ + for (lb = 0; lb < nub; ++lb) { + Unnz[lb] = 0; + k = lb * grid->npcol + mycol;/* Global block number, column-wise. */ + knsupc = SuperSize( k ); + // printf("ID %5d lb %5d Urbs[lb] %10d\n",superGridMap[0],lb, Urbs[lb+nub]); + for (ub = 0; ub < Urbs[lb]; ++ub) { + ik = Ucb_indptr[lb][ub].lbnum; /* Local block number, row-wise. */ + i = Ucb_indptr[lb][ub].indpos; /* Start of the block in usub[]. */ + i += UB_DESCRIPTOR; + gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ + iklrow = FstBlockC( gik+1 ); + for (jj = 0; jj < knsupc; ++jj) { + fnz = Ufstnz_br_ptr[ik][i + jj]; + if ( fnz < iklrow ) { + Unnz[lb] +=iklrow-fnz; + } + } /* for jj ... 
*/ + } + } + + // for (int lb = 0; lb < nub; ++lb) { + // printf("ID %5d lb %5d, superGridMap[lb] %5d, Unnz[lb] %5d\n",superGridMap[0],lb, superGridMap[lb], Unnz[lb]); + // } + + Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; + Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr; + Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; + Llu->Ufstnz_br_ptr = Ufstnz_br_ptr; + Llu->Unzval_br_ptr = Unzval_br_ptr; + Llu->Unnz = Unnz; + Llu->ToRecv = ToRecv; + Llu->ToSendD = ToSendD; + Llu->ToSendR = ToSendR; + Llu->fmod = fmod; + Llu->fsendx_plist = fsendx_plist; + Llu->nfrecvx = nfrecvx; + Llu->nfsendx = nfsendx; + Llu->bmod = bmod; + Llu->bsendx_plist = bsendx_plist; + Llu->nbrecvx = nbrecvx; + Llu->nbsendx = nbsendx; + Llu->ilsum = ilsum; + Llu->ldalsum = ldaspa; + Llu->Linv_bc_ptr = Linv_bc_ptr; + Llu->Uinv_bc_ptr = Uinv_bc_ptr; + Llu->Urbs = Urbs; + Llu->Ucb_indptr = Ucb_indptr; + Llu->Ucb_valptr = Ucb_valptr; + +#if ( PRNTlevel>=1 ) + if ( !iam ) printf(".. # L blocks " IFMT "\t# U blocks " IFMT "\n", + nLblocks, nUblocks); #endif - SUPERLU_FREE(rb_marker); - SUPERLU_FREE(Urb_fstnz); - SUPERLU_FREE(Urb_length); - SUPERLU_FREE(Urb_indptr); - SUPERLU_FREE(Lrb_length); - SUPERLU_FREE(Lrb_number); - SUPERLU_FREE(Lrb_indptr); - SUPERLU_FREE(Lrb_valptr); - SUPERLU_FREE(dense); - - /* Find the maximum buffer size. */ - MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, - MPI_MAX, grid->comm); - - k = CEILING(nsupers, grid->nprow); /* Number of local block rows */ - if (!(Llu->mod_bit = int32Malloc_dist(k))) - ABORT("Malloc fails for mod_bit[]."); - -#if (PROFlevel >= 1) - if (!iam) - printf(".. 1st distribute time:\n " - "\tL\t%.2f\n\tU\t%.2f\n" - "\tu_blks %d\tnrbu %d\n--------\n", - t_l, t_u, u_blks, nrbu); + SUPERLU_FREE(rb_marker); + SUPERLU_FREE(Urb_fstnz); + SUPERLU_FREE(Urb_length); + SUPERLU_FREE(Urb_indptr); + SUPERLU_FREE(Lrb_length); + SUPERLU_FREE(Lrb_number); + SUPERLU_FREE(Lrb_indptr); + SUPERLU_FREE(Lrb_valptr); + SUPERLU_FREE(dense); + + k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ + mem_use -= (k*8)*iword+ldaspa*sp_ienv_dist(3,options)*dword; + + /* Find the maximum buffer size. */ + MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, + MPI_MAX, grid->comm); + + k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(Llu->mod_bit = int32Malloc_dist(k)) ) + ABORT("Malloc fails for mod_bit[]."); + +#if ( PROFlevel>=1 ) + if ( !iam ) printf(".. 1st distribute time:\n " + "\tL\t%.2f\n\tU\t%.2f\n" + "\tu_blks %d\tnrbu %d\n--------\n", + t_l, t_u, u_blks, nrbu); #endif } /* else fact != SamePattern_SameRowPerm */ - if (xa[A->ncol] > 0) - { /* may not have any entries on this process. */ + if ( xa[A->ncol] > 0 ) { /* may not have any entries on this process. */ SUPERLU_FREE(asub); SUPERLU_FREE(a); } SUPERLU_FREE(xa); -#if (DEBUGlevel >= 1) +#if ( DEBUGlevel>=1 ) /* Memory allocated but not freed: ilsum, fmod, fsendx_plist, bmod, bsendx_plist */ - CHECK_MALLOC(iam, "Exit pddistribute()"); + CHECK_MALLOC(iam, "Exit pddistribute_allgrid()"); #endif - return (mem_use + memTRS); + return (mem_use+memTRS); + +} /* PDDISTRIBUTE3D_Yang */ -} /* PDDISTRIBUTE */ diff --git a/SRC/pdgssvx3d.c b/SRC/pdgssvx3d.c index cced6e42..92593fb2 100755 --- a/SRC/pdgssvx3d.c +++ b/SRC/pdgssvx3d.c @@ -23,6 +23,10 @@ at the top-level directory. 
*/ #include "superlu_ddefs.h" #include "TRF3dV100/superlu_summit.h" +#include "pddistribute3d.h" +#include "ssvx3dAux.c" +int_t dgatherAllFactoredLU3d( dtrf3Dpartition_t* trf3Dpartition, + dLUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT ); #include /*! \brief * @@ -497,8 +501,6 @@ at the top-level directory. * See superlu_ddefs.h for the definitions of varioous data types. * */ -// dSOLVEstruct3d_t * SOLVEstruct, -// SOLVEstruct->A3d int writeLUtoDisk(int nsupers, int_t *xsup, dLUstruct_t *LUstruct) { @@ -714,10 +716,9 @@ void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A, SuperMatrix GA; /* Global A in NC format */ NCformat *GAstore; double *a_GA; - SuperMatrix GAC; /* Global A in NCP format (add n end pointers) */ - NCPformat *GACstore; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - Glu_freeable_t *Glu_freeable; + Glu_freeable_t *Glu_freeable = NULL; /* The nonzero structures of L and U factors, which are replicated on all processrs. (lsub, xlsub) contains the compressed subscript of @@ -778,34 +779,8 @@ void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A, /* Test the options choices. */ *info = 0; Fact = options->Fact; - if (Fact < 0 || Fact > FACTORED) - *info = -1; - else if (options->RowPerm < 0 || options->RowPerm > MY_PERMR) - *info = -1; - else if (options->ColPerm < 0 || options->ColPerm > MY_PERMC) - *info = -1; - else if (options->IterRefine < 0 || options->IterRefine > SLU_EXTRA) - *info = -1; - else if (options->IterRefine == SLU_EXTRA) - { - *info = -1; - fprintf(stderr, - "Extra precise iterative refinement yet to support."); - } - else if (A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc || A->Dtype != SLU_D || A->Mtype != SLU_GE) - *info = -2; - else if (ldb < Astore->m_loc) - *info = -5; - else if (nrhs < 0) - { - *info = -6; - } - if (*info) - { - i = -(*info); - pxerr_dist("pdgssvx3d", grid, -(*info)); - return; - } + + validateInput_ssvx3d(options, A, ldb, nrhs, grid3d, info); /* Initialization. */ @@ -826,7 +801,7 @@ void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A, B is then aliased to B2d for the following 2D solve; */ dGatherNRformat_loc3d_allgrid(Fact, (NRformat_loc *)A->Store, - B, ldb, nrhs, grid3d, &A3d); + B, ldb, nrhs, grid3d, &A3d); B = (double *)A3d->B2d; /* B is now pointing to B2d, allocated in dGatherNRformat_loc3d. */ @@ -880,8 +855,13 @@ void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A, Equil = (!factored && options->Equil == YES); notran = (options->Trans == NOTRANS); - iam = grid->iam; + + if (grid3d->zscp.Iam == 0) /* on 2D grid-0 */ + { + + /* The following code now works on 2D grid-0 */ + job = 5; /* Extract equilibration status from a previous factorization */ if (factored || (Fact == SamePattern_SameRowPerm && Equil)) @@ -896,154 +876,27 @@ void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A, rowequ = colequ = FALSE; } + /* Not factored & ask for equilibration, then alloc RC */ + if (Equil && Fact != SamePattern_SameRowPerm) + dallocScalePermstruct_RC(ScalePermstruct, m, n); + /* The following arrays are replicated on all processes. */ perm_r = ScalePermstruct->perm_r; perm_c = ScalePermstruct->perm_c; etree = LUstruct->etree; R = ScalePermstruct->R; C = ScalePermstruct->C; - /********/ - - /* Not factored & ask for equilibration */ - if (Equil && Fact != SamePattern_SameRowPerm) - { - /* Allocate storage if not done so before. 
*/ - switch (ScalePermstruct->DiagScale) - { - case NOEQUIL: - if (!(R = (double *)doubleMalloc_dist(m))) - ABORT("Malloc fails for R[]."); - if (!(C = (double *)doubleMalloc_dist(n))) - ABORT("Malloc fails for C[]."); - ScalePermstruct->R = R; - ScalePermstruct->C = C; - break; - case ROW: - if (!(C = (double *)doubleMalloc_dist(n))) - ABORT("Malloc fails for C[]."); - ScalePermstruct->C = C; - break; - case COL: - if (!(R = (double *)doubleMalloc_dist(m))) - ABORT("Malloc fails for R[]."); - ScalePermstruct->R = R; - break; - default: - break; - } - } /* ------------------------------------------------------------ Diagonal scaling to equilibrate the matrix. ------------------------------------------------------------ */ if (Equil) { -#if (DEBUGlevel >= 1) - CHECK_MALLOC(iam, "Enter equil"); -#endif - t = SuperLU_timer_(); - - if (Fact == SamePattern_SameRowPerm) - { - /* Reuse R and C. */ - switch (ScalePermstruct->DiagScale) - { - case NOEQUIL: - break; - case ROW: - irow = fst_row; - for (j = 0; j < m_loc; ++j) - { - for (i = rowptr[j]; i < rowptr[j + 1]; ++i) - { - a[i] *= R[irow]; /* Scale rows. */ - } - ++irow; - } - break; - case COL: - for (j = 0; j < m_loc; ++j) - for (i = rowptr[j]; i < rowptr[j + 1]; ++i) - { - icol = colind[i]; - a[i] *= C[icol]; /* Scale columns. */ - } - break; - case BOTH: - irow = fst_row; - for (j = 0; j < m_loc; ++j) - { - for (i = rowptr[j]; i < rowptr[j + 1]; ++i) - { - icol = colind[i]; - a[i] *= R[irow] * C[icol]; /* Scale rows and cols. */ - } - ++irow; - } - break; - } - } - else - { /* Compute R & C from scratch */ - /* Compute the row and column scalings. */ - pdgsequ(A, R, C, &rowcnd, &colcnd, &amax, &iinfo, grid); + scaleMatrixDiagonally(Fact, ScalePermstruct, + A, stat, grid, &rowequ, &colequ, &iinfo); + if (iinfo < 0) + return; // return if error - if (iinfo > 0) - { - if (iinfo <= m) - { -#if (PRNTlevel >= 1) - fprintf(stderr, "The " IFMT "-th row of A is exactly zero\n", iinfo); -#endif - } - else - { -#if (PRNTlevel >= 1) - fprintf(stderr, "The " IFMT "-th column of A is exactly zero\n", iinfo - n); -#endif - } - } - else if (iinfo < 0) - return; - - /* Now iinfo == 0 */ - - /* Equilibrate matrix A if it is badly-scaled. - A <-- diag(R)*A*diag(C) */ - pdlaqgs(A, R, C, rowcnd, colcnd, amax, equed); - - if (strncmp(equed, "R", 1) == 0) - { - ScalePermstruct->DiagScale = ROW; - rowequ = ROW; - } - else if (strncmp(equed, "C", 1) == 0) - { - ScalePermstruct->DiagScale = COL; - colequ = COL; - } - else if (strncmp(equed, "B", 1) == 0) - { - ScalePermstruct->DiagScale = BOTH; - rowequ = ROW; - colequ = COL; - } - else - ScalePermstruct->DiagScale = NOEQUIL; - -#if (PRNTlevel >= 1) - if (iam == 0) - { - printf(".. equilibrated? *equed = %c\n", *equed); - fflush(stdout); - } -#endif - } /* end if-else Fact ... */ - - stat->utime[EQUIL] = SuperLU_timer_() - t; -#if (DEBUGlevel >= 1) - CHECK_MALLOC(iam, "Exit equil"); -#endif } /* end if Equil ... 
LAPACK style, not involving MC64 */ if (!factored) @@ -1057,253 +910,28 @@ void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A, if (Fact != SamePattern_SameRowPerm && (parSymbFact == NO || options->RowPerm != NO)) { - - need_value = (options->RowPerm == LargeDiag_MC64); - + int_t need_value = (options->RowPerm == LargeDiag_MC64); pdCompRow_loc_to_CompCol_global(need_value, A, grid, &GA); - GAstore = (NCformat *)GA.Store; - colptr = GAstore->colptr; - rowind = GAstore->rowind; nnz = GAstore->nnz; - GA_mem_use = (nnz + n + 1) * sizeof(int_t); - - if (need_value) - { - a_GA = (double *)GAstore->nzval; - GA_mem_use += nnz * sizeof(double); - } - - else + GA_mem_use = (nnz + n + 1) * sizeof(int_t) + need_value * nnz * sizeof(double); + if (!need_value) assert(GAstore->nzval == NULL); } /* ------------------------------------------------------------ Find the row permutation for A. - ------------------------------------------------------------ */ - if (options->RowPerm != NO) - { - t = SuperLU_timer_(); - if (Fact != SamePattern_SameRowPerm) - { - if (options->RowPerm == MY_PERMR) - { - /* Use user's perm_r. */ - /* Permute the global matrix GA for symbfact() */ - for (i = 0; i < colptr[n]; ++i) - { - irow = rowind[i]; - rowind[i] = perm_r[irow]; - } - } - else if (options->RowPerm == LargeDiag_MC64) - { - /* Get a new perm_r[] */ - if (job == 5) - { - /* Allocate storage for scaling factors. */ - if (!(R1 = doubleMalloc_dist(m))) - ABORT("SUPERLU_MALLOC fails for R1[]"); - if (!(C1 = doubleMalloc_dist(n))) - ABORT("SUPERLU_MALLOC fails for C1[]"); - } - - if (iam == 0) - { - /* Process 0 finds a row permutation */ - iinfo = dldperm_dist(job, m, nnz, colptr, rowind, a_GA, - perm_r, R1, C1); - MPI_Bcast(&iinfo, 1, mpi_int_t, 0, grid->comm); - if (iinfo == 0) - { - MPI_Bcast(perm_r, m, mpi_int_t, 0, grid->comm); - if (job == 5 && Equil) - { - MPI_Bcast(R1, m, MPI_DOUBLE, 0, grid->comm); - MPI_Bcast(C1, n, MPI_DOUBLE, 0, grid->comm); - } - } - } - else - { - MPI_Bcast(&iinfo, 1, mpi_int_t, 0, grid->comm); - if (iinfo == 0) - { - MPI_Bcast(perm_r, m, mpi_int_t, 0, grid->comm); - if (job == 5 && Equil) - { - MPI_Bcast(R1, m, MPI_DOUBLE, 0, grid->comm); - MPI_Bcast(C1, n, MPI_DOUBLE, 0, grid->comm); - } - } - } - - if (iinfo && job == 5) - { /* Error return */ - SUPERLU_FREE(R1); - SUPERLU_FREE(C1); - } -#if (PRNTlevel >= 2) - dmin = damch_dist("Overflow"); - dsum = 0.0; - dprod = 1.0; -#endif - if (iinfo == 0) - { - if (job == 5) - { - if (Equil) - { - for (i = 0; i < n; ++i) - { - R1[i] = exp(R1[i]); - C1[i] = exp(C1[i]); - } - - /* Scale the distributed matrix further. - A <-- diag(R1)*A*diag(C1) */ - irow = fst_row; - for (j = 0; j < m_loc; ++j) - { - for (i = rowptr[j]; i < rowptr[j + 1]; ++i) - { - icol = colind[i]; - a[i] *= R1[irow] * C1[icol]; -#if (PRNTlevel >= 2) - if (perm_r[irow] == icol) - { - /* New diagonal */ - if (job == 2 || job == 3) - dmin = SUPERLU_MIN(dmin, fabs(a[i])); - else if (job == 4) - dsum += fabs(a[i]); - else if (job == 5) - dprod *= fabs(a[i]); - } -#endif - } - ++irow; - } - - /* Multiply together the scaling factors -- - R/C from simple scheme, R1/C1 from MC64. 
*/ - if (rowequ) - for (i = 0; i < m; ++i) - R[i] *= R1[i]; - else - for (i = 0; i < m; ++i) - R[i] = R1[i]; - if (colequ) - for (i = 0; i < n; ++i) - C[i] *= C1[i]; - else - for (i = 0; i < n; ++i) - C[i] = C1[i]; - - ScalePermstruct->DiagScale = BOTH; - rowequ = colequ = 1; - - } /* end if Equil */ - - /* Now permute global A to prepare for symbfact() */ - for (j = 0; j < n; ++j) - { - for (i = colptr[j]; i < colptr[j + 1]; ++i) - { - irow = rowind[i]; - rowind[i] = perm_r[irow]; - } - } - SUPERLU_FREE(R1); - SUPERLU_FREE(C1); - } - else - { /* job = 2,3,4 */ - for (j = 0; j < n; ++j) - { - for (i = colptr[j]; i < colptr[j + 1]; ++i) - { - irow = rowind[i]; - rowind[i] = perm_r[irow]; - } /* end for i ... */ - } /* end for j ... */ - } /* end else job ... */ - } - else - { /* if iinfo != 0 */ - for (i = 0; i < m; ++i) - perm_r[i] = i; - } -#if (PRNTlevel >= 2) - if (job == 2 || job == 3) - { - if (!iam) - printf("\tsmallest diagonal %e\n", dmin); - } - else if (job == 4) - { - if (!iam) - printf("\tsum of diagonal %e\n", dsum); - } - else if (job == 5) - { - if (!iam) - printf("\t product of diagonal %e\n", dprod); - } -#endif - } - else - { /* use LargeDiag_HWPM */ -#ifdef HAVE_COMBBLAS - d_c2cpp_GetHWPM(A, grid, ScalePermstruct); -#else - if (iam == 0) - { - printf("CombBLAS is not available\n"); - fflush(stdout); - } -#endif - } /* end if-else options->RowPerm ... */ - - t = SuperLU_timer_() - t; - stat->utime[ROWPERM] = t; -#if (PRNTlevel >= 1) - if (!iam) - { - printf(".. LDPERM job " IFMT "\t time: %.2f\n", job, t); - fflush(stdout); - } -#endif - } /* end if Fact not SamePattern_SameRowPerm ... */ - } - else - { /* options->RowPerm == NOROWPERM / NATURAL */ - for (i = 0; i < m; ++i) - perm_r[i] = i; - } + ------------------------------------------------------------ */ + perform_row_permutation( + options, Fact, ScalePermstruct, LUstruct, + m, n, grid, A, &GA, stat, job, Equil, + &rowequ, &colequ, &iinfo); -#if (DEBUGlevel >= 2) - if (!iam) - PrintInt10("perm_r", m, perm_r); -#endif } /* end if (!factored) */ + /* Compute norm(A), which will be used to adjust small diagonal. */ if (!factored || options->IterRefine) - { - /* Compute norm(A), which will be used to adjust small diagonal. */ - if (notran) - *(unsigned char *)norm = '1'; - else - *(unsigned char *)norm = 'I'; - anorm = pdlangs(norm, A, grid); -#if (PRNTlevel >= 1) - if (!iam) - { - printf(".. anorm %e\n", anorm); - fflush(stdout); - } -#endif - } + anorm = computeA_Norm(notran, A, grid); /* ------------------------------------------------------------ Perform ordering and symbolic factorization @@ -1324,6 +952,10 @@ void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A, if (parSymbFact == YES || permc_spec == PARMETIS) { + if(grid3d->npdep!=1){ + fprintf(stderr, "Error: ParMETIS and Parallel Symbolic Factorization are not yet supported with grid3d->npdep>1.\n"); + return; // or exit(-1); if you want to terminate the program + } nprocs_num = grid->nprow * grid->npcol; noDomains = (int)(pow(2, ((int)LOG2(nprocs_num)))); @@ -1395,77 +1027,19 @@ void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A, { if (parSymbFact == NO) { - - int_t *GACcolbeg, *GACcolend, *GACrowind; - - sp_colorder(options, &GA, perm_c, etree, &GAC); - - /* Form Pc*A*Pc' to preserve the diagonal of the matrix GAC. 
*/ - GACstore = (NCPformat *)GAC.Store; - GACcolbeg = GACstore->colbeg; - GACcolend = GACstore->colend; - GACrowind = GACstore->rowind; - for (j = 0; j < n; ++j) - { - for (i = GACcolbeg[j]; i < GACcolend[j]; ++i) - { - irow = GACrowind[i]; - GACrowind[i] = perm_c[irow]; - } - } - - /* Perform a symbolic factorization on Pc*Pr*A*Pc' and set up - the nonzero data structures for L & U. */ -#if (PRNTlevel >= 1) - if (!iam) - printf(".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n", - sp_ienv_dist(2, options), sp_ienv_dist(3, options), sp_ienv_dist(6, options)); -#endif - t = SuperLU_timer_(); - if (!(Glu_freeable = (Glu_freeable_t *) - SUPERLU_MALLOC(sizeof(Glu_freeable_t)))) - ABORT("Malloc fails for Glu_freeable."); - - /* Every process does this. */ - iinfo = symbfact(options, iam, &GAC, perm_c, etree, - Glu_persist, Glu_freeable); - - stat->utime[SYMBFAC] = SuperLU_timer_() - t; - if (iinfo < 0) - { - /* Successful return */ - QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage); - -#if (PRNTlevel >= 1) - if (!iam) - { - printf("\tNo of supers %ld\n", - (long)Glu_persist->supno[n - 1] + 1); - printf("\tSize of G(L) %ld\n", (long)Glu_freeable->xlsub[n]); - printf("\tSize of G(U) %ld\n", (long)Glu_freeable->xusub[n]); - printf("\tint %lu, short %lu, float %lu, double %lu\n", - sizeof(int_t), sizeof(short), - sizeof(float), sizeof(double)); - printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", - symb_mem_usage.for_lu * 1e-6, - symb_mem_usage.total * 1e-6, - symb_mem_usage.expansions); - } -#endif - } - else - { - if (!iam) - { - fprintf(stderr, "symbfact() error returns %d\n", - (int)iinfo); - exit(-1); - } - } + /*Allocating Glu_freeable used by symbfact */ + if (!(Glu_freeable = (Glu_freeable_t *) + SUPERLU_MALLOC(sizeof(Glu_freeable_t)))) + ABORT("Malloc fails for Glu_freeable."); + permCol_SymbolicFact3d(options, n, &GA, perm_c, etree, + Glu_persist, Glu_freeable, stat, + &symb_mem_usage, + grid3d); } /* end serial symbolic factorization */ else { /* parallel symbolic factorization */ + //TODO: need a 3D version of symbfact_dist t = SuperLU_timer_(); flinfo = symbfact_dist(options, nprocs_num, noDomains, @@ -1481,120 +1055,127 @@ void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A, /* Destroy GA */ if (parSymbFact == NO || options->RowPerm != NO) Destroy_CompCol_Matrix_dist(&GA); - if (parSymbFact == NO) - Destroy_CompCol_Permuted_dist(&GAC); } /* end if Fact not SamePattern_SameRowPerm */ + } /* end if not Factored */ + } /* end 2D process layer 0 */ + + MPI_Bcast(&rowequ, 1, mpi_int_t, 0, grid3d->zscp.comm); + MPI_Bcast(&colequ, 1, mpi_int_t, 0, grid3d->zscp.comm); -#if (DEBUGlevel >= 2) // Sherry - if (!iam) - PrintInt10("perm_c", m, perm_c); -#endif - if (sizes) - SUPERLU_FREE(sizes); - if (fstVtxSep) - SUPERLU_FREE(fstVtxSep); - if (symb_comm != MPI_COMM_NULL) - MPI_Comm_free(&symb_comm); - - if (parSymbFact == NO || Fact == SamePattern_SameRowPerm) + /* Broadcast Permuted A and symbolic factorization data from 2d to 3d grid*/ + if (Fact != SamePattern_SameRowPerm && !factored) // place the exact conditions later //all the grid must execute this + { + if (parSymbFact == NO){ + if (Glu_freeable == NULL) { - /* Apply column permutation to the original distributed A */ - for (j = 0; j < nnz_loc; ++j) - colind[j] = perm_c[colind[j]]; - - /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. - NOTE: the row permutation Pc*Pr is applied internally in the - distribution routine. 
*/ - t = SuperLU_timer_(); - - nsupers = getNsupers(n, LUstruct->Glu_persist); - int* supernodeMask; - if(Fact == SamePattern_SameRowPerm){ - supernodeMask=trf3Dpartition->supernodeMask; - dist_mem_use = pddistribute_allgrid(options, n, A, ScalePermstruct, - Glu_freeable, LUstruct, grid, supernodeMask); - - }else{ - - // First call of pddistribute_allgrid with a prefixed supernodeMask - // YL: this first call can be removed with Piyush's cleaner fix - int* supernodeMask = int32Calloc_dist(nsupers); - for (int i=0;izscp.Iam == i%grid3d->npdep) - supernodeMask[i]=1; - } - dist_mem_use = pddistribute_allgrid_index_only(options, n, A, ScalePermstruct, - Glu_freeable, LUstruct, grid, supernodeMask); - SUPERLU_FREE(supernodeMask); + if (!(Glu_freeable = (Glu_freeable_t *) + SUPERLU_MALLOC(sizeof(Glu_freeable_t)))) + ABORT("Malloc fails for Glu_freeable."); + } + bcastPermutedSparseA(A, + ScalePermstruct, + Glu_freeable, + LUstruct, grid3d); + }else{ + //TODO: need a parmetis version of bcastPermutedSparseA broadcasting Pslu_freeable + } + } - // Generate the 3D partition - dDestroy_trf3Dpartition(LUstruct->trf3Dpart); - trf3Dpartition = dinitTrf3Dpartition_allgrid(n, options, LUstruct, grid3d); - LUstruct->trf3Dpart=trf3Dpartition; - - // Delete the meta data generated by pddistribute_allgrid - dLocalLU_t *Llu = LUstruct->Llu; - for (int jb = 0; jb < CEILING( nsupers, grid->npcol ); ++jb) { /* for each block column ... */ - if ( Llu->Lrowind_bc_ptr[jb] ) { - SUPERLU_FREE (Llu->Lrowind_bc_ptr[jb]); - } - } - SUPERLU_FREE (Llu->Lrowind_bc_ptr); - for (int lb = 0; lb < CEILING( nsupers, grid->nprow ); ++lb) { /* for each block row ... */ - if(Llu->Ufstnz_br_ptr[lb]!=NULL) - SUPERLU_FREE(Llu->Ufstnz_br_ptr[lb]); - } - SUPERLU_FREE(Llu->Ufstnz_br_ptr); + perm_r = ScalePermstruct->perm_r; + perm_c = ScalePermstruct->perm_c; + etree = LUstruct->etree; + R = ScalePermstruct->R; + C = ScalePermstruct->C; + nsupers = getNsupers(n, LUstruct->Glu_persist); + Astore = (NRformat_loc *)A->Store; + a = (double *)Astore->nzval; + rowptr = Astore->rowptr; + colind = Astore->colind; + Glu_persist = LUstruct->Glu_persist; + // perform the 3D distribution + if (!factored) + { + /* Apply column permutation to the original distributed A */ + for (j = 0; j < nnz_loc; ++j) + colind[j] = perm_c[colind[j]]; + // free quauntities used in Parmetis + if (sizes) + SUPERLU_FREE(sizes); + if (fstVtxSep) + SUPERLU_FREE(fstVtxSep); + if (symb_comm != MPI_COMM_NULL) + MPI_Comm_free(&symb_comm); + if ( Fact != SamePattern_SameRowPerm){ + LUstruct->trf3Dpart = SUPERLU_MALLOC(sizeof(dtrf3Dpartition_t)); + // computes the new partition for 3D factorization here + trf3Dpartition=LUstruct->trf3Dpart; + newTrfPartitionInit(nsupers, LUstruct, grid3d); + } + } - // Second call of pddistribute_allgrid with the final supernodeMask - dist_mem_use = pddistribute_allgrid(options, n, A, ScalePermstruct, - Glu_freeable, LUstruct, grid, trf3Dpartition->supernodeMask); - /* now that LU structure has been scattered, initialize the LU and buffers */ - dinit3DLUstructForest(trf3Dpartition->myTreeIdxs, trf3Dpartition->myZeroTrIdxs, - trf3Dpartition->sForests, LUstruct, grid3d); - dLUValSubBuf_t *LUvsb = SUPERLU_MALLOC(sizeof(dLUValSubBuf_t)); - dLluBufInit(LUvsb, LUstruct); - trf3Dpartition->LUvsb = LUvsb; - } + // perform the 3D distribution + if (!factored) + { /* Skip this if already factored. 
*/ + if (parSymbFact == NO || Fact == SamePattern_SameRowPerm) + { - stat->utime[DIST] = SuperLU_timer_() - t; - /* Deallocate storage used in symbolic factorization. */ - if (Fact != SamePattern_SameRowPerm) - { - iinfo = symbfact_SubFree(Glu_freeable); - SUPERLU_FREE(Glu_freeable); - } + /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. + NOTE: the row permutation Pc*Pr is applied internally in the + distribution routine. */ + t = SuperLU_timer_(); - } - else + dist_mem_use = pddistribute3d_Yang(options, n, A, ScalePermstruct, + Glu_freeable, LUstruct, grid3d); + stat->utime[DIST] = SuperLU_timer_() - t; + + /* Deallocate storage used in symbolic factorization. */ + if (Fact != SamePattern_SameRowPerm) { - /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. - NOTE: the row permutation Pc*Pr is applied internally in the - distribution routine. */ - /* Apply column permutation to the original distributed A */ - for (j = 0; j < nnz_loc; ++j) - colind[j] = perm_c[colind[j]]; + iinfo = symbfact_SubFree(Glu_freeable); + SUPERLU_FREE(Glu_freeable); + } + } + else + { + /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. + NOTE: the row permutation Pc*Pr is applied internally in the + distribution routine. */ + + // TODO: need a 3D version of ddist_psymbtonum + t = SuperLU_timer_(); + dist_mem_use = ddist_psymbtonum(options, n, A, ScalePermstruct, + &Pslu_freeable, LUstruct, grid); + if (dist_mem_use > 0) + ABORT("Not enough memory available for dist_psymbtonum\n"); - t = SuperLU_timer_(); - dist_mem_use = ddist_psymbtonum(options, n, A, ScalePermstruct, - &Pslu_freeable, LUstruct, grid); - if (dist_mem_use > 0) - ABORT("Not enough memory available for dist_psymbtonum\n"); + stat->utime[DIST] = SuperLU_timer_() - t; - stat->utime[DIST] = SuperLU_timer_() - t; + ABORT("ddist_psymbtonum does not yet work with 3D factorization\n"); - ABORT("ddist_psymbtonum does not yet work with 3D factorization\n"); + } - } + if(Fact != SamePattern_SameRowPerm){ + // checkDist3DLUStruct(LUstruct, grid3d); + // zeros out the Supernodes that are not owned by the grid + dinit3DLUstructForest(trf3Dpartition->myTreeIdxs, trf3Dpartition->myZeroTrIdxs, + trf3Dpartition->sForests, LUstruct, grid3d); + + dLUValSubBuf_t *LUvsb = SUPERLU_MALLOC(sizeof(dLUValSubBuf_t)); + dLluBufInit(LUvsb, LUstruct); + trf3Dpartition->LUvsb = LUvsb; + trf3Dpartition->iperm_c_supno = create_iperm_c_supno(nsupers, options, LUstruct, grid3d); + } - /*if (!iam) printf ("\tDISTRIBUTE time %8.2f\n", stat->utime[DIST]); */ + /*if (!iam) printf ("\tDISTRIBUTE time %8.2f\n", stat->utime[DIST]); */ + + MPI_Bcast(&anorm, 1, MPI_DOUBLE, 0, grid3d->zscp.comm); /* Perform numerical factorization in parallel on all process layers.*/ @@ -1721,7 +1302,7 @@ void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A, if (writeLU) { if (!grid3d->zscp.Iam) - writeLUtoDisk(nsupers, Glu_persist->xsup, LUstruct); + writeLUtoDisk(nsupers, LUstruct->Glu_persist->xsup, LUstruct); } int checkLU = 0; @@ -1733,7 +1314,7 @@ void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A, if (checkLU) { if (!grid3d->zscp.Iam) - checkLUFromDisk(nsupers, Glu_persist->xsup, LUstruct); + checkLUFromDisk(nsupers, LUstruct->Glu_persist->xsup, LUstruct); } #if (PRNTlevel >= 0) @@ -2218,7 +1799,7 @@ if (getenv("SUPERLU_ACC_SOLVE")){ SUPERLU_MALLOC (sizeof (pxgstrs_comm_t)))) ABORT ("Malloc fails for gstrs_comm[]"); pdgstrs_init (n, m_loc, 1, fst_row, perm_r, perm_c, grid, - Glu_persist, SOLVEstruct1); + LUstruct->Glu_persist, 
SOLVEstruct1); if (getenv("SUPERLU_ACC_SOLVE")){ int_t nsupers = getNsupers(n, LUstruct->Glu_persist); pdgstrs_init_device_lsum_x(options, n, m_loc, 1, grid,LUstruct, SOLVEstruct1,trf3Dpartition->supernodeMask); @@ -2388,7 +1969,7 @@ if (getenv("SUPERLU_ACC_SOLVE")){ SUPERLU_MALLOC (sizeof (pxgstrs_comm_t)))) ABORT ("Malloc fails for gstrs_comm[]"); pdgstrs_init (n, m_loc, 1, fst_row, perm_r, perm_c, grid, - Glu_persist, SOLVEstruct1); + LUstruct->Glu_persist, SOLVEstruct1); if (getenv("SUPERLU_ACC_SOLVE")){ int_t nsupers = getNsupers(n, LUstruct->Glu_persist); int* supernodeMask = int32Malloc_dist(nsupers); diff --git a/SRC/pdgssvx3d_1pass_Yang.c b/SRC/pdgssvx3d_1pass_Yang.c new file mode 100755 index 00000000..a3cea62b --- /dev/null +++ b/SRC/pdgssvx3d_1pass_Yang.c @@ -0,0 +1,2116 @@ + +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + +/*! @file + * \brief Solves a system of linear equations A*X=B using 3D process grid. + * + *
+ * -- Distributed SuperLU routine (version 7.2) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ * October 5, 2021
+ * Last update: November 8, 2021  v7.2.0
+ */
+#include "superlu_ddefs.h"
+#include "TRF3dV100/superlu_summit.h"
+#include "pddistribute3d.h"
+#include "ssvx3dAux.c"
+int_t dgatherAllFactoredLU3d( dtrf3Dpartition_t*  trf3Dpartition,
+			   dLUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT );
+#include <math.h>
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * PDGSSVX3D solves a system of linear equations A*X=B,
+ * by using Gaussian elimination with "static pivoting" to
+ * compute the LU factorization of A.
+ *
+ * Static pivoting is a technique that combines the numerical stability
+ * of partial pivoting with the scalability of Cholesky (no pivoting),
+ * to run accurately and efficiently on large numbers of processors.
+ * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed
+ * description of the parallel algorithms.
+ *
+ * The input matrices A and B are distributed by block rows.
+ * Here is a graphical illustration (0-based indexing):
+ *
+ *                        A                B
+ *               0 ---------------       ------
+ *                   |           |        |  |
+ *                   |           |   P0   |  |
+ *                   |           |        |  |
+ *                 ---------------       ------
+ *        - fst_row->|           |        |  |
+ *        |          |           |        |  |
+ *       m_loc       |           |   P1   |  |
+ *        |          |           |        |  |
+ *        -          |           |        |  |
+ *                 ---------------       ------
+ *                   |    .      |        |. |
+ *                   |    .      |        |. |
+ *                   |    .      |        |. |
+ *                 ---------------       ------
+ *
+ * where fst_row is the row number of the first row and
+ *        m_loc is the number of rows local to this processor.
+ * These are defined in the 'SuperMatrix' structure; see supermatrix.h.
+ *
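+ * A minimal sketch of this layout (illustrative only: m, n, nnz_loc,
+ * nzval, colind, rowptr, iam and nprocs are assumed to be set up by the
+ * caller; dCreate_CompRowLoc_Matrix_dist is the standard wrapper that
+ * builds the distributed SLU_NR_loc matrix from the local CSR piece):
+ *
+ *        int_t m_loc   = m / nprocs;     /* even block-row split        */
+ *        int_t fst_row = iam * m_loc;    /* first row on this process   */
+ *        if (iam == nprocs - 1)          /* last process takes the      */
+ *            m_loc = m - fst_row;        /* leftover rows               */
+ *
+ *        SuperMatrix A;
+ *        dCreate_CompRowLoc_Matrix_dist(&A, m, n, nnz_loc, m_loc, fst_row,
+ *                                       nzval, colind, rowptr,
+ *                                       SLU_NR_loc, SLU_D, SLU_GE);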
+ *
+ * Here are the options for using this code (a usage sketch combining
+ * them follows this list):
+ *
+ *   1. Independent of all the other options specified below, the
+ *      user must supply
+ *
+ *      -  B, the matrix of right-hand sides, distributed by block rows,
+ *            and its dimensions ldb (local) and nrhs (global)
+ *      -  grid3d, a structure describing the 3D process grid
+ *      -  options->IterRefine, which determines whether or not to
+ *            improve the accuracy of the computed solution using
+ *            iterative refinement
+ *
+ *      On output, B is overwritten with the solution X.
+ *
+ *   2. Depending on options->Fact, the user has four options
+ *      for solving A*X=B. The standard option is for factoring
+ *      A "from scratch". (The other options, described below,
+ *      are used when A is sufficiently similar to a previously
+ *      solved problem to save time by reusing part or all of
+ *      the previous factorization.)
+ *
+ *      -  options->Fact = DOFACT: A is factored "from scratch"
+ *
+ *      In this case the user must also supply
+ *
+ *        o  A, the input matrix
+ *
+ *        as well as the following options to determine what matrix to
+ *        factorize.
+ *
+ *        o  options->Equil,   to specify how to scale the rows and columns
+ *                             of A to "equilibrate" it (to try to reduce its
+ *                             condition number and so improve the
+ *                             accuracy of the computed solution)
+ *
+ *        o  options->RowPerm, to specify how to permute the rows of A
+ *                             (typically to control numerical stability)
+ *
+ *        o  options->ColPerm, to specify how to permute the columns of A
+ *                             (typically to control fill-in and enhance
+ *                             parallelism during factorization)
+ *
+ *        o  options->ReplaceTinyPivot, to specify how to deal with tiny
+ *                             pivots encountered during factorization
+ *                             (to control numerical stability)
+ *
+ *      The outputs returned include
+ *
+ *        o  ScalePermstruct,  modified to describe how the input matrix A
+ *                             was equilibrated and permuted:
+ *          .  ScalePermstruct->DiagScale, indicates whether the rows and/or
+ *                                         columns of A were scaled
+ *          .  ScalePermstruct->R, array of row scale factors
+ *          .  ScalePermstruct->C, array of column scale factors
+ *          .  ScalePermstruct->perm_r, row permutation vector
+ *          .  ScalePermstruct->perm_c, column permutation vector
+ *
+ *          (part of ScalePermstruct may also need to be supplied on input,
+ *           depending on options->RowPerm and options->ColPerm as described
+ *           later).
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix diag(R)*A*diag(C)*Pc^T, where
+ *              Pc is the column permutation matrix determined by
+ *                  ScalePermstruct->perm_c
+ *              diag(R) and diag(C) are diagonal scaling matrices determined
+ *                  by ScalePermstruct->DiagScale, ScalePermstruct->R and
+ *                  ScalePermstruct->C
+ *
+ *        o  LUstruct, which contains the L and U factorization of A1, where
+ *
+ *                A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
+ *
+ *               (Note that A1 = Pc*Pr*Aout, where Aout is the matrix stored
+ *                in A on output.)
+ *
+ *   3. The second value of options->Fact assumes that a matrix with the same
+ *      sparsity pattern as A has already been factored:
+ *
+ *      -  options->Fact = SamePattern: A is factored, assuming that it has
+ *            the same nonzero pattern as a previously factored matrix. In
+ *            this case the algorithm saves time by reusing the previously
+ *            computed column permutation vector stored in
+ *            ScalePermstruct->perm_c and the "elimination tree" of A
+ *            stored in LUstruct->etree
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *        o  options->Equil
+ *        o  options->RowPerm
+ *        o  options->ReplaceTinyPivot
+ *
+ *      but not options->ColPerm, whose value is ignored. This is because the
+ *      previous column permutation from ScalePermstruct->perm_c is used as
+ *      input. The user must also supply
+ *
+ *        o  A, the input matrix
+ *        o  ScalePermstruct->perm_c, the column permutation
+ *        o  LUstruct->etree, the elimination tree
+ *
+ *      The outputs returned include
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix as described above
+ *        o  ScalePermstruct, modified to describe how the input matrix A was
+ *                            equilibrated and row permuted
+ *        o  LUstruct, modified to contain the new L and U factors
+ *
+ *   4. The third value of options->Fact assumes that a matrix B with the same
+ *      sparsity pattern as A has already been factored, and that the
+ *      row permutation of B can be reused for A. This is useful when A and B
+ *      have similar numerical values, so that the same row permutation
+ *      will make both factorizations numerically stable. This lets us reuse
+ *      all of the previously computed structure of L and U.
+ *
+ *      -  options->Fact = SamePattern_SameRowPerm: A is factored,
+ *            assuming not only the same nonzero pattern as the previously
+ *            factored matrix B, but reusing B's row permutation.
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *        o  options->Equil
+ *        o  options->ReplaceTinyPivot
+ *
+ *      but not options->RowPerm or options->ColPerm, whose values are
+ *      ignored. This is because the permutations from ScalePermstruct->perm_r
+ *      and ScalePermstruct->perm_c are used as input.
+ *
+ *      The user must also supply
+ *
+ *        o  A, the input matrix
+ *        o  ScalePermstruct->DiagScale, how the previous matrix was row
+ *                                       and/or column scaled
+ *        o  ScalePermstruct->R, the row scalings of the previous matrix,
+ *                               if any
+ *        o  ScalePermstruct->C, the column scalings of the previous matrix,
+ *                               if any
+ *        o  ScalePermstruct->perm_r, the row permutation of the previous
+ *                                    matrix
+ *        o  ScalePermstruct->perm_c, the column permutation of the previous
+ *                                    matrix
+ *        o  all of LUstruct, the previously computed information about
+ *                            L and U (the actual numerical values of L and U
+ *                            stored in LUstruct->Llu are ignored)
+ *
+ *      The outputs returned include
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix as described above
+ *        o  ScalePermstruct,  modified to describe how the input matrix A was
+ *                             equilibrated (thus ScalePermstruct->DiagScale,
+ *                             R and C may be modified)
+ *        o  LUstruct, modified to contain the new L and U factors
+ *
+ *   5. The fourth and last value of options->Fact assumes that A is
+ *      identical to a matrix that has already been factored on a previous
+ *      call, and reuses its entire LU factorization
+ *
+ *      -  options->Fact = FACTORED: A is identical to a previously
+ *            factorized matrix, so the entire previous factorization
+ *            can be reused.
+ *
+ *      In this case all the other options mentioned above are ignored
+ *      (options->Equil, options->RowPerm, options->ColPerm,
+ *       options->ReplaceTinyPivot)
+ *
+ *      The user must also supply
+ *
+ *        o  A, the unfactored matrix, only in the case that iterative
+ *              refinement is to be done (specifically A must be the output
+ *              A from the previous call, so that it has been scaled and permuted)
+ *        o  all of ScalePermstruct
+ *        o  all of LUstruct, including the actual numerical values of
+ *           L and U
+ *
+ *      all of which are unmodified on output.
+ *
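+ *   As a concrete sketch (grid setup, statistics, and error checking
+ *   omitted; the EXAMPLE/ drivers contain complete programs), a first
+ *   factorization followed by a re-factorization of a matrix with the
+ *   same sparsity pattern looks like:
+ *
+ *        options.Fact = DOFACT;
+ *        pdgssvx3d(&options, &A, &ScalePermstruct, b, ldb, nrhs,
+ *                  &grid3d, &LUstruct, &SOLVEstruct, berr, &stat, &info);
+ *        /* ... refill A with new values, keeping its pattern ... */
+ *        options.Fact = SamePattern;
+ *        pdgssvx3d(&options, &A, &ScalePermstruct, b, ldb, nrhs,
+ *                  &grid3d, &LUstruct, &SOLVEstruct, berr, &stat, &info);
+ *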
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_dist_options_t* (global)
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *         The following fields should be defined for this structure:
+ *
+ *         o Fact (fact_t)
+ *           Specifies whether or not the factored form of the matrix
+ *           A is supplied on entry, and if not, how the matrix A should
+ *           be factorized based on the previous history.
+ *
+ *           = DOFACT: The matrix A will be factorized from scratch.
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ColPerm, ReplaceTinyPivot
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *           = SamePattern: the matrix A will be factorized assuming
+ *             that a factorization of a matrix with the same sparsity
+ *             pattern was performed prior to this one. Therefore, this
+ *             factorization will reuse column permutation vector
+ *             ScalePermstruct->perm_c and the elimination tree
+ *             LUstruct->etree
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ReplaceTinyPivot
+ *                          ScalePermstruct->perm_c
+ *                          LUstruct->etree
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          rest of ScalePermstruct (DiagScale, R, C, perm_r)
+ *                          rest of LUstruct (Glu_persist, Llu)
+ *
+ *           = SamePattern_SameRowPerm: the matrix A will be factorized
+ *             assuming that a factorization of a matrix with the same
+ *             sparsity pattern and similar numerical values was performed
+ *             prior to this one. Therefore, this factorization will reuse
+ *             the row and column scaling factors R and C, the row and
+ *             column permutation vectors perm_r and perm_c, and the
+ *             distributed data structure set up from the previous symbolic
+ *             factorization.
+ *                 Inputs:  A
+ *                          options->Equil, ReplaceTinyPivot
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          modified LUstruct->Llu
+ *           = FACTORED: the matrix A is already factored.
+ *                 Inputs:  all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *         o Equil (yes_no_t)
+ *           Specifies whether to equilibrate the system.
+ *           = NO:  no equilibration.
+ *           = YES: scaling factors are computed to equilibrate the system:
+ *                      diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B.
+ *                  Whether or not the system will be equilibrated depends
+ *                  on the scaling of the matrix A, but if equilibration is
+ *                  used, A is overwritten by diag(R)*A*diag(C) and B by
+ *                  diag(R)*B.
+ *
+ *         o RowPerm (rowperm_t)
+ *           Specifies how to permute rows of the matrix A.
+ *           = NATURAL:   use the natural ordering.
+ *           = LargeDiag_MC64: use the Duff/Koster algorithm to permute rows of
+ *                        the original matrix to make the diagonal large
+ *                        relative to the off-diagonal.
+ *           = LargeDiag_HWPM: use the parallel approximate-weight perfect
+ *                        matching to permute rows of the original matrix
+ *                        to make the diagonal large relative to the
+ *                        off-diagonal.
+ *           = MY_PERMR:  use the ordering given in ScalePermstruct->perm_r
+ *                        input by the user.
+ *
+ *         o ColPerm (colperm_t)
+ *           Specifies what type of column permutation to use to reduce fill.
+ *           = NATURAL:       natural ordering.
+ *           = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A.
+ *           = MMD_ATA:       minimum degree ordering on structure of A'*A.
+ *           = MY_PERMC:      the ordering given in ScalePermstruct->perm_c.
+ *
+ *         o ReplaceTinyPivot (yes_no_t)
+ *           = NO:  do not modify pivots
+ *           = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during
+ *                  LU factorization.
+ *
+ *         o IterRefine (IterRefine_t)
+ *           Specifies how to perform iterative refinement.
+ *           = NO:     no iterative refinement.
+ *           = SLU_DOUBLE: accumulate residual in double precision.
+ *           = SLU_EXTRA:  accumulate residual in extra precision.
+ *
+ *         NOTE: all options must be identical on all processes when
+ *               calling this routine.
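+ *               A simple way to ensure this is to initialize the
+ *               structure identically everywhere, e.g. (a sketch):
+ *
+ *                  superlu_dist_options_t options;
+ *                  set_default_options_dist(&options);
+ *                  options.IterRefine = SLU_DOUBLE; /* then override fields */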
+ *
+ * A (input) SuperMatrix* (local); A resides on all 3D processes.
+ *         On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol).
+ *           The number of linear equations is A->nrow. The type of A must be:
+ *           Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE.
+ *           That is, A is stored in distributed compressed row format.
+ *           See supermatrix.h for the definition of 'SuperMatrix'.
+ *           This routine handles only square A; the underlying LU
+ *           factorization routine PDGSTRF can, however, factorize
+ *           rectangular matrices.
+ *
+ *         Internally, A is gathered on 2D process grid-0; call it A2d.
+ *         On exit, A2d may be overwritten by diag(R)*A*diag(C)*Pc^T,
+ *           depending on ScalePermstruct->DiagScale and options->ColPerm:
+ *             if ScalePermstruct->DiagScale != NOEQUIL, A2d is overwritten by
+ *                diag(R)*A*diag(C).
+ *             if options->ColPerm != NATURAL, A2d is further overwritten by
+ *                diag(R)*A*diag(C)*Pc^T.
+ *           If all the above conditions are true, the LU decomposition is
+ *           performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T.
+ *
+ * ScalePermstruct (input/output) dScalePermstruct_t* (global)
+ *         The data structure to store the scaling and permutation vectors
+ *         describing the transformations performed to the matrix A.
+ *         It contains the following fields:
+ *
+ *         o DiagScale (DiagScale_t)
+ *           Specifies the form of equilibration that was done.
+ *           = NOEQUIL: no equilibration.
+ *           = ROW:     row equilibration, i.e., A was premultiplied by
+ *                      diag(R).
+ *           = COL:     Column equilibration, i.e., A was postmultiplied
+ *                      by diag(C).
+ *           = BOTH:    both row and column equilibration, i.e., A was
+ *                      replaced by diag(R)*A*diag(C).
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm,
+ *           DiagScale is an input argument; otherwise it is an output
+ *           argument.
+ *
+ *         o perm_r (int*)
+ *           Row permutation vector, which defines the permutation matrix Pr;
+ *           perm_r[i] = j means row i of A is in position j in Pr*A.
+ *           If options->RowPerm = MY_PERMR, or
+ *           options->Fact = SamePattern_SameRowPerm, perm_r is an
+ *           input argument; otherwise it is an output argument.
+ *
+ *         o perm_c (int*)
+ *           Column permutation vector, which defines the
+ *           permutation matrix Pc; perm_c[i] = j means column i of A is
+ *           in position j in A*Pc.
+ *           If options->ColPerm = MY_PERMC or options->Fact = SamePattern
+ *           or options->Fact = SamePattern_SameRowPerm, perm_c is an
+ *           input argument; otherwise, it is an output argument.
+ *           On exit, perm_c may be overwritten by the product of the input
+ *           perm_c and a permutation that postorders the elimination tree
+ *           of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree
+ *           is already in postorder.
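+ *           For example (hypothetical values), with n = 3 and
+ *           perm_c = {2, 0, 1}, column 0 of A is placed in position 2
+ *           of A*Pc, column 1 in position 0, and column 2 in position 1.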
+ *
+ *         o R (double *) dimension (A->nrow)
+ *           The row scale factors for A.
+ *           If DiagScale = ROW or BOTH, A is multiplied on the left by
+ *                          diag(R).
+ *           If DiagScale = NOEQUIL or COL, R is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, R is
+ *           an input argument; otherwise, R is an output argument.
+ *
+ *         o C (double *) dimension (A->ncol)
+ *           The column scale factors for A.
+ *           If DiagScale = COL or BOTH, A is multiplied on the right by
+ *                          diag(C).
+ *           If DiagScale = NOEQUIL or ROW, C is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, C is
+ *           an input argument; otherwise, C is an output argument.
+ *
+ * B       (input/output) double* (local)
+ *         On entry, the right-hand side matrix of dimension (m_loc, nrhs),
+ *           where m_loc is the number of rows stored locally on my
+ *           process and is defined in the data structure of matrix A.
+ *         On exit, the solution matrix if info = 0.
+ *
+ * ldb     (input) int (local)
+ *         The leading dimension of matrix B.
+ *
+ * nrhs    (input) int (global)
+ *         The number of right-hand sides.
+ *         If nrhs = 0, only LU decomposition is performed, the forward
+ *         and back substitutions are skipped.
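+ *         For example, a factorize-only call (a sketch) passes nrhs = 0:
+ *
+ *            pdgssvx3d(&options, &A, &ScalePermstruct, b, ldb, 0,
+ *                      &grid3d, &LUstruct, &SOLVEstruct, berr, &stat, &info);
+ *
+ *         a later call with options.Fact = FACTORED and nrhs > 0 then
+ *         reuses the stored factors to solve.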
+ *
+ * grid3d  (input) gridinfo3d_t* (global)
+ *         The 3D process grid. It contains the MPI communicator, the number
+ *         of process rows (NPROW), the number of process columns (NPCOL),
+ *         the replication factor in the Z dimension (NPDEP), and my process
+ *         rank. It is an input argument to all the parallel routines.
+ *         The 3D grid can be initialized by subroutine SUPERLU_GRIDINIT3D.
+ *         See superlu_defs.h for the definition of 'gridinfo3d_t'.
+ *
+ * LUstruct (input/output) dLUstruct_t*
+ *         The data structures to store the distributed L and U factors.
+ *         It contains the following fields:
+ *
+ *         o etree (int*) dimension (A->ncol) (global)
+ *           Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'.
+ *           It is computed in sp_colorder() during the first factorization,
+ *           and is reused in the subsequent factorizations of the matrices
+ *           with the same nonzero pattern.
+ *           On exit of sp_colorder(), the columns of A are permuted so that
+ *           the etree is in a certain postorder. This postorder is reflected
+ *           in ScalePermstruct->perm_c.
+ *           NOTE:
+ *           Etree is a vector of parent pointers for a forest whose vertices
+ *           are the integers 0 to A->ncol-1; etree[root]==A->ncol.
+ *
+ *         o Glu_persist (Glu_persist_t*) (global)
+ *           Global data structure (xsup, supno) replicated on all processes,
+ *           describing the supernode partition in the factored matrices
+ *           L and U:
+ *             xsup[s] is the leading column of the s-th supernode,
+ *             supno[i] is the supernode number to which column i belongs.
+ *
+ *         o Llu (dLocalLU_t*) (local)
+ *           The distributed data structures to store L and U factors.
+ *           See superlu_ddefs.h for the definition of 'dLocalLU_t'.
+ *
+ * SOLVEstruct (input/output) dSOLVEstruct_t*
+ *         The data structure to hold the communication pattern used
+ *         in the phases of triangular solution and iterative refinement.
+ *         This pattern should be initialized only once for repeated solutions.
+ *         If options->SolveInitialized = YES, it is an input argument.
+ *         If options->SolveInitialized = NO and nrhs != 0, it is an output
+ *         argument. See superlu_ddefs.h for the definition of 'dSOLVEstruct_t'.
+ *
+ * berr    (output) double*, dimension (nrhs) (global)
+ *         The componentwise relative backward error of each solution
+ *         vector X(j) (i.e., the smallest relative change in
+ *         any element of A or B that makes X(j) an exact solution).
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util_dist.h for the definition of 'SuperLUStat_t'.
+ *
+ * info    (output) int*
+ *         = 0: successful exit
+ *         < 0: if info = -i, the i-th argument had an illegal value
+ *         > 0: if info = i, and i is
+ *             <= A->ncol: U(i,i) is exactly zero. The factorization has
+ *                been completed, but the factor U is exactly singular,
+ *                so the solution could not be computed.
+ *             > A->ncol: number of bytes allocated when memory allocation
+ *                failure occurred, plus A->ncol.
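+ *                For example, with A->ncol = 10000, a returned info of
+ *                5010000 indicates that the allocation failure occurred
+ *                after 5000000 bytes (5010000 - 10000) had been allocated.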
+ *
+ * See superlu_ddefs.h for the definitions of various data types.
+ *
+ */
+
+int writeLUtoDisk(int nsupers, int_t *xsup, dLUstruct_t *LUstruct)
+{
+    if (getenv("LUFILE"))
+    {
+        FILE *fp = fopen(getenv("LUFILE"), "w");
+        printf("writing to %s", getenv("LUFILE"));
+        for (int i = 0; i < nsupers; i++)
+        {
+            if (LUstruct->Llu->Lrowind_bc_ptr[i])
+            {
+                int_t *lsub = LUstruct->Llu->Lrowind_bc_ptr[i];
+                double *nzval = LUstruct->Llu->Lnzval_bc_ptr[i];
+
+                int_t len = lsub[1]; /* LDA of the nzval[] */
+                int_t len2 = SuperSize(i) * len;
+                fwrite(nzval, sizeof(double), len2, fp); // assume fp will be incremented
+            }
+
+            if (LUstruct->Llu->Ufstnz_br_ptr[i])
+            {
+                int_t *usub = LUstruct->Llu->Ufstnz_br_ptr[i];
+                double *nzval = LUstruct->Llu->Unzval_br_ptr[i];
+                int_t lenv = usub[1];
+
+                fwrite(nzval, sizeof(double), lenv, fp); // assume fp will be incremented
+            }
+        }
+
+        fclose(fp);
+    }
+    else
+    {
+        printf("Please set environment variable LUFILE to write\n..bye bye");
+        exit(0);
+    }
+
+    return 0;
+}
+
+#define EPSILON 1e-3
+
+static int checkArr(double *A, double *B, int n)
+{
+    for (int i = 0; i < n; i++)
+    {
+        assert(fabs(A[i] - B[i]) <= EPSILON * SUPERLU_MIN(fabs(A[i]), fabs(B[i])));
+    }
+
+    return 0;
+}
+
+int checkLUFromDisk(int nsupers, int_t *xsup, dLUstruct_t *LUstruct)
+{
+    dLocalLU_t *Llu = LUstruct->Llu;
+
+    double *Lval_buf = doubleMalloc_dist(Llu->bufmax[1]); // DOUBLE_ALLOC(Llu->bufmax[1]);
+    double *Uval_buf = doubleMalloc_dist(Llu->bufmax[3]); // DOUBLE_ALLOC(Llu->bufmax[3]);
+
+    if (getenv("LUFILE"))
+    {
+        FILE *fp = fopen(getenv("LUFILE"), "r");
+        printf("reading from %s", getenv("LUFILE"));
+        for (int i = 0; i < nsupers; i++)
+        {
+            if (LUstruct->Llu->Lrowind_bc_ptr[i])
+            {
+                int_t *lsub = LUstruct->Llu->Lrowind_bc_ptr[i];
+                double *nzval = LUstruct->Llu->Lnzval_bc_ptr[i];
+
+                int_t len = lsub[1]; /* LDA of the nzval[] */
+                int_t len2 = SuperSize(i) * len;
+                fread(Lval_buf, sizeof(double), len2, fp); // assume fp will be incremented
+                checkArr(nzval, Lval_buf, len2);
+            }
+
+            if (LUstruct->Llu->Ufstnz_br_ptr[i])
+            {
+                int_t *usub = LUstruct->Llu->Ufstnz_br_ptr[i];
+                double *nzval = LUstruct->Llu->Unzval_br_ptr[i];
+                int_t lenv = usub[1];
+
+                fread(Uval_buf, sizeof(double), lenv, fp); // assume fp will be incremented
+                checkArr(nzval, Uval_buf, lenv);
+            }
+        }
+        printf("Checking LU from %s is successful ", getenv("LUFILE"));
+        fclose(fp);
+    }
+    else
+    {
+        printf("Please set environment variable LUFILE to read\n..bye bye");
+        exit(0);
+    }
+
+    return 0;
+}
+
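+/* A possible debugging workflow for the two helpers above (a sketch; the
+   WRITELU/CHECKLU switches are read further below in pdgssvx3d, and
+   pddrive3d stands in for whichever driver is being run):
+
+       LUFILE=/tmp/LU.bin WRITELU=1 mpirun -n 8 ./pddrive3d ...   # dump factors
+       LUFILE=/tmp/LU.bin CHECKLU=1 mpirun -n 8 ./pddrive3d ...   # re-run, compare
+
+   checkArr() asserts elementwise agreement up to the relative tolerance
+   EPSILON defined above. */
+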
+/*! \brief Dump the factored matrix L using Matlab triplet format
+ */
+void dDumpLblocks3D(int_t nsupers, gridinfo3d_t *grid3d,
+                    Glu_persist_t *Glu_persist, dLocalLU_t *Llu)
+{
+    register int c, extra, gb, j, i, lb, nsupc, nsupr, len, nb, ncb;
+    int k, mycol, r, n, nmax;
+    int_t nnzL;
+    int_t *xsup = Glu_persist->xsup;
+    int_t *index;
+    double *nzval;
+    char filename[256];
+    FILE *fp, *fopen();
+    gridinfo_t *grid = &(grid3d->grid2d);
+    int iam = grid->iam;
+    int iam3d = grid3d->iam;
+
+    // assert(grid->npcol*grid->nprow==1);
+
+    // count nonzeros in the first pass
+    nnzL = 0;
+    n = 0;
+    ncb = nsupers / grid->npcol;
+    extra = nsupers % grid->npcol;
+    mycol = MYCOL( iam, grid );
+    if ( mycol < extra ) ++ncb;
+    for (lb = 0; lb < ncb; ++lb) {
+        index = Llu->Lrowind_bc_ptr[lb];
+        if ( index ) { /* Not an empty column */
+            nzval = Llu->Lnzval_bc_ptr[lb];
+            nb = index[0];
+            nsupr = index[1];
+            gb = lb * grid->npcol + mycol;
+            nsupc = SuperSize( gb );
+            for (c = 0, k = BC_HEADER, r = 0; c < nb; ++c) {
+                len = index[k+1];
+
+                for (j = 0; j < nsupc; ++j) {
+                    for (i = 0; i < len; ++i) {
+                        if (index[k+LB_DESCRIPTOR+i]+1 >= xsup[gb]+j+1) {
+                            nnzL++;
+                            nmax = SUPERLU_MAX(n, index[k+LB_DESCRIPTOR+i]+1);
+                            n = nmax;
+                        }
+                    }
+                }
+                k += LB_DESCRIPTOR + len;
+                r += len;
+            }
+        }
+    }
+    MPI_Allreduce(MPI_IN_PLACE, &nnzL, 1, mpi_int_t, MPI_SUM, grid->comm);
+    MPI_Allreduce(MPI_IN_PLACE, &n, 1, mpi_int_t, MPI_MAX, grid->comm);
+
+    snprintf(filename, sizeof(filename), "%s-%d", "L", iam3d);
+    printf("Dumping L factor to --> %s\n", filename);
+    if ( !(fp = fopen(filename, "w")) ) {
+        ABORT("File open failed");
+    }
+
+    if (grid->iam == 0) {
+        fprintf(fp, "%d %d " IFMT "\n", n, n, nnzL);
+    }
+
+    ncb = nsupers / grid->npcol;
+    extra = nsupers % grid->npcol;
+    mycol = MYCOL( iam, grid );
+    if ( mycol < extra ) ++ncb;
+    for (lb = 0; lb < ncb; ++lb) {
+        index = Llu->Lrowind_bc_ptr[lb];
+        if ( index ) { /* Not an empty column */
+            nzval = Llu->Lnzval_bc_ptr[lb];
+            nb = index[0];
+            nsupr = index[1];
+            gb = lb * grid->npcol + mycol;
+            nsupc = SuperSize( gb );
+            for (c = 0, k = BC_HEADER, r = 0; c < nb; ++c) {
+                len = index[k+1];
+
+                for (j = 0; j < nsupc; ++j) {
+                    for (i = 0; i < len; ++i) {
+                        if (index[k+LB_DESCRIPTOR+i]+1 >= xsup[gb]+j+1) {
+                            fprintf(fp, IFMT IFMT " %e\n",
+                                    index[k+LB_DESCRIPTOR+i]+1, xsup[gb]+j+1,
+                                    nzval[r + i + j*nsupr]);
+                        }
+                    }
+                }
+                k += LB_DESCRIPTOR + len;
+                r += len;
+            }
+        }
+    }
+    fclose(fp);
+} /* end dDumpLblocks3D */
+
+
+void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A,
+               dScalePermstruct_t *ScalePermstruct,
+               double B[], int ldb, int nrhs, gridinfo3d_t *grid3d,
+               dLUstruct_t *LUstruct, dSOLVEstruct_t *SOLVEstruct,
+               double *berr, SuperLUStat_t *stat, int *info)
+{
+    NRformat_loc *Astore = (NRformat_loc *)A->Store;
+    SuperMatrix GA; /* Global A in NC format */
+    NCformat *GAstore;
+    double *a_GA;
+    SuperMatrix GAC; /* Global A in NCP format (add n end pointers) */
+    NCPformat *GACstore;
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    Glu_freeable_t *Glu_freeable = NULL;
+    /* The nonzero structures of L and U factors, which are
+       replicated on all processors.
+       (lsub, xlsub) contains the compressed subscript of
+       supernodes in L.
+       (usub, xusub) contains the compressed subscript of
+       nonzero segments in U.
+       If options->Fact != SamePattern_SameRowPerm, they are
+       computed by SYMBFACT routine, and then used by PDDISTRIBUTE
+       routine. They will be freed after PDDISTRIBUTE routine.
+       If options->Fact == SamePattern_SameRowPerm, these
+       structures are not used. */
+    yes_no_t parSymbFact = options->ParSymbFact;
+    fact_t Fact;
+    double *a;
+    int_t *colptr, *rowind;
+    int_t *perm_r; /* row permutations from partial pivoting */
+    int_t *perm_c; /* column permutation vector */
+    int_t *etree;  /* elimination tree */
+    int_t *rowptr, *colind; /* Local A in NR */
+    int_t colequ, Equil, factored, job, notran, rowequ, need_value;
+    int_t i, iinfo, j, irow, m, n, nnz, permc_spec;
+    int_t nnz_loc, m_loc, fst_row, icol;
+    int iam;
+    int ldx; /* LDA for matrix X (local). */
+    char equed[1], norm[1];
+    double *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd;
+    double *X, *b_col, *b_work, *x_col;
+    double t;
+    float GA_mem_use;   /* memory usage by global A */
+    float dist_mem_use; /* memory usage during distribution */
+    superlu_dist_mem_usage_t num_mem_usage, symb_mem_usage;
+    float flinfo; /* track memory usage of parallel symbolic factorization */
+    bool Solve3D = true;
+    int_t nsupers;
+#if (PRNTlevel >= 2)
+    double dmin, dsum, dprod;
+#endif
+
+    dtrf3Dpartition_t *trf3Dpartition = LUstruct->trf3Dpart;
+    int gpu3dVersion = 0;
+#ifdef GPU_ACC
+    // gpu3dVersion = 1;
+    if (getenv("GPU3DVERSION"))
+    {
+        gpu3dVersion = atoi(getenv("GPU3DVERSION"));
+    }
+
+    LUgpu_Handle LUgpu;
+#endif
+
+    LUstruct->dt = 'd';
+
+    // get the 2d grid
+    gridinfo_t *grid = &(grid3d->grid2d);
+    iam = grid->iam;
+
+    /* Test the options choices. */
+    *info = 0;
+    Fact = options->Fact;
+    validateInput_ssvx3d(options, A, ldb, nrhs, grid3d, info);
+
+    /* Initialization. */
+
+    options->Algo3d = YES;
+
+    /* definition of factored seen by each process layer */
+    factored = (Fact == FACTORED);
+
+    /* Save the inputs: ldb -> ldb3d, and B -> B3d, Astore -> Astore3d,
+       so that the names {ldb, B, and Astore} can be used internally.
+       B3d and Astore3d will be assigned back to B and Astore on return. */
+    int ldb3d = ldb;
+    NRformat_loc *Astore3d = (NRformat_loc *)A->Store;
+    NRformat_loc3d *A3d = SOLVEstruct->A3d;
+
+    /* B3d is aliased to B;
+       B2d is allocated;
+       B is then aliased to B2d for the following 2D solve;
+    */
+    dGatherNRformat_loc3d_allgrid(Fact, (NRformat_loc *)A->Store,
+                                  B, ldb, nrhs, grid3d, &A3d);
+
+    B = (double *)A3d->B2d; /* B is now pointing to B2d,
+                               allocated in dGatherNRformat_loc3d. */
+    // PrintDouble5("after gather B=B2d", ldb, B);
+
+    SOLVEstruct->A3d = A3d; /* This structure needs to be persistent across
+                               multiple calls of pdgssvx3d() */
+
+    NRformat_loc *Astore0 = A3d->A_nfmt; // on all grids
+    NRformat_loc *A_orig = A->Store;
+    //////
+
+#if (DEBUGlevel >= 1)
+    CHECK_MALLOC(iam, "Enter pdgssvx3d()");
+#endif
+
+    /* Perform preprocessing steps on process layer zero, including:
+       gather 3D matrices {A, B} onto 2D grid-0, preprocessing steps:
+       - equilibration,
+       - ordering,
+       - symbolic factorization,
+       - distribution of L & U */
+
+    m = A->nrow;
+    n = A->ncol;
+    // checkNRFMT(Astore0, (NRformat_loc *) A->Store);
+
+    // On input, A->Store is on 3D, now A->Store is re-assigned to 2D store
+    A->Store = Astore0; // on all grids
+    ldb = Astore0->m_loc;
+
+    /* The following code now works on all grids */
+    Astore = (NRformat_loc *)A->Store;
+    nnz_loc = Astore->nnz_loc;
+    m_loc = Astore->m_loc;
+    fst_row = Astore->fst_row;
+    a = (double *)Astore->nzval;
+    rowptr = Astore->rowptr;
+    colind = Astore->colind;
+
+    /* Structures needed for parallel symbolic factorization */
+    int_t *sizes, *fstVtxSep;
+    int noDomains, nprocs_num;
+    MPI_Comm symb_comm; /* communicator for symbolic factorization */
+    int col, key;       /* parameters for creating a new communicator */
+    Pslu_freeable_t Pslu_freeable;
+
+    sizes = NULL;
+    fstVtxSep = NULL;
+    symb_comm = MPI_COMM_NULL;
+
+    Equil = (!factored && options->Equil == YES);
+    notran = (options->Trans == NOTRANS);
+
+    iam = grid->iam;
+    job = 5;
+    /* Extract equilibration status from a previous factorization */
+    if (factored || (Fact == SamePattern_SameRowPerm && Equil))
+    {
+        rowequ = (ScalePermstruct->DiagScale == ROW) ||
+                 (ScalePermstruct->DiagScale == BOTH);
+        colequ = (ScalePermstruct->DiagScale == COL) ||
+                 (ScalePermstruct->DiagScale == BOTH);
+    }
+    else
+    {
+        rowequ = colequ = FALSE;
+    }
+
+    /* Not factored & ask for equilibration, then alloc RC */
+    if (Equil && Fact != SamePattern_SameRowPerm)
+        dallocScalePermstruct_RC(ScalePermstruct, m, n);
+
+    /* The following arrays are replicated on all processes. */
+    perm_r = ScalePermstruct->perm_r;
+    perm_c = ScalePermstruct->perm_c;
+    etree = LUstruct->etree;
+    R = ScalePermstruct->R;
+    C = ScalePermstruct->C;
+
+    /* ------------------------------------------------------------
+       Diagonal scaling to equilibrate the matrix.
+       ------------------------------------------------------------ */
+    if (Equil)
+    {
+        scaleMatrixDiagonally(Fact, ScalePermstruct,
+                              A, stat, grid, &rowequ, &colequ, &iinfo);
+        if (iinfo < 0)
+            return; // return if error
+
+    } /* end if Equil ... LAPACK style, not involving MC64 */
+
+    if (!factored)
+    { /* Skip this if already factored. */
+        /*
+         * Gather A from the distributed compressed row format to
+         * global A in compressed column format.
+         * Numerical values are gathered only when a row permutation
+         * for large diagonal is sought after.
+         */
+        if (Fact != SamePattern_SameRowPerm &&
+            (parSymbFact == NO || options->RowPerm != NO))
+        {
+            need_value = (options->RowPerm == LargeDiag_MC64);
+
+            pdCompRow_loc_to_CompCol_global(need_value, A, grid, &GA);
+
+            GAstore = (NCformat *)GA.Store;
+            colptr = GAstore->colptr;
+            rowind = GAstore->rowind;
+            nnz = GAstore->nnz;
+            GA_mem_use = (nnz + n + 1) * sizeof(int_t);
+
+            if (need_value)
+            {
+                a_GA = (double *)GAstore->nzval;
+                GA_mem_use += nnz * sizeof(double);
+            }
+            else
+                assert(GAstore->nzval == NULL);
+        }
+
+        /* ------------------------------------------------------------
+           Find the row permutation for A.
+           ------------------------------------------------------------ */
+        perform_row_permutation(
+            options, Fact, ScalePermstruct, LUstruct,
+            m, n, grid, A, &GA, stat, job, Equil,
+            &rowequ, &colequ, &iinfo);
+
+    } /* end if (!factored) */
+
+    /* Compute norm(A), which will be used to adjust small diagonal. */
+    if (!factored || options->IterRefine)
+        anorm = computeA_Norm(notran, A, grid);
+
+    /* ------------------------------------------------------------
+       Perform ordering and symbolic factorization
+       ------------------------------------------------------------ */
+    if (!factored)
+    {
+        t = SuperLU_timer_();
+        /*
+         * Get column permutation vector perm_c[], according to permc_spec:
+         *   permc_spec = NATURAL:         natural ordering
+         *   permc_spec = MMD_AT_PLUS_A:   minimum degree on structure of A'+A
+         *   permc_spec = MMD_ATA:         minimum degree on structure of A'*A
+         *   permc_spec = METIS_AT_PLUS_A: METIS on structure of A'+A
+         *   permc_spec = PARMETIS:        parallel METIS on structure of A'+A
+         *   permc_spec = MY_PERMC:        the ordering already supplied in perm_c[]
+         */
+        permc_spec = options->ColPerm;
+
+        if (parSymbFact == YES || permc_spec == PARMETIS)
+        {
+            nprocs_num = grid->nprow * grid->npcol;
+            noDomains = (int)(pow(2, ((int)LOG2(nprocs_num))));
+
+            /* create a new communicator for the first noDomains
+               processes in grid->comm */
+            key = iam;
+            if (iam < noDomains)
+                col = 0;
+            else
+                col = MPI_UNDEFINED;
+            MPI_Comm_split(grid->comm, col, key, &symb_comm);
+
+            if (permc_spec == NATURAL || permc_spec == MY_PERMC)
+            {
+                if (permc_spec == NATURAL)
+                {
+                    for (j = 0; j < n; ++j)
+                        perm_c[j] = j;
+                }
+                if (!(sizes = intMalloc_dist(2 * noDomains)))
+                    ABORT("SUPERLU_MALLOC fails for sizes.");
+                if (!(fstVtxSep = intMalloc_dist(2 * noDomains)))
+                    ABORT("SUPERLU_MALLOC fails for fstVtxSep.");
+                for (i = 0; i < 2 * noDomains - 2; ++i)
+                {
+                    sizes[i] = 0;
+                    fstVtxSep[i] = 0;
+                }
+                sizes[2 * noDomains - 2] = m;
+                fstVtxSep[2 * noDomains - 2] = 0;
+            }
+            else if (permc_spec != PARMETIS)
+            {
+                /* same as before */
+                printf("{%4d,%4d}: pdgssvx3d: invalid ColPerm option when ParSymbfact is used\n",
+                       (int)MYROW(grid->iam, grid), (int)MYCOL(grid->iam, grid));
+            }
+        } /* end ... use parmetis */
+
+        if (permc_spec != MY_PERMC && Fact == DOFACT)
+        {
+            if (permc_spec == PARMETIS)
+            {
+                /* Get column permutation vector in perm_c.                   *
+                 * This routine takes as input the distributed input matrix A *
+                 * and does not modify it.  It also allocates memory for      *
+                 * sizes[] and fstVtxSep[] arrays, that contain information   *
+                 * on the separator tree computed by ParMETIS.                */
+                flinfo = get_perm_c_parmetis(A, perm_r, perm_c, nprocs_num,
+                                             noDomains, &sizes, &fstVtxSep,
+                                             grid, &symb_comm);
+                if (flinfo > 0)
+                    ABORT("ERROR in get perm_c parmetis.");
+            }
+            else
+            {
+                get_perm_c_dist(iam, permc_spec, &GA, perm_c);
+            }
+        }
+
+        stat->utime[COLPERM] = SuperLU_timer_() - t;
+
+        /* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'
+           (a.k.a. column etree), depending on the choice of ColPerm.
+           Adjust perm_c[] to be consistent with a postorder of etree.
+           Permute columns of A to form A*Pc'. */
+        if (Fact != SamePattern_SameRowPerm)
+        {
+            if (parSymbFact == NO)
+            {
+                int_t *GACcolbeg, *GACcolend, *GACrowind;
+
+                sp_colorder(options, &GA, perm_c, etree, &GAC);
+
+                /* Form Pc*A*Pc' to preserve the diagonal of the matrix GAC. */
+                GACstore = (NCPformat *)GAC.Store;
+                GACcolbeg = GACstore->colbeg;
+                GACcolend = GACstore->colend;
+                GACrowind = GACstore->rowind;
+                for (j = 0; j < n; ++j)
+                {
+                    for (i = GACcolbeg[j]; i < GACcolend[j]; ++i)
+                    {
+                        irow = GACrowind[i];
+                        GACrowind[i] = perm_c[irow];
+                    }
+                }
+
+                /* Perform a symbolic factorization on Pc*Pr*A*Pc' and set up
+                   the nonzero data structures for L & U. */
+#if (PRNTlevel >= 1)
+                if (!iam)
+                    printf(".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n",
+                           sp_ienv_dist(2, options), sp_ienv_dist(3, options), sp_ienv_dist(6, options));
+#endif
+                t = SuperLU_timer_();
+                if (!(Glu_freeable = (Glu_freeable_t *)
+                          SUPERLU_MALLOC(sizeof(Glu_freeable_t))))
+                    ABORT("Malloc fails for Glu_freeable.");
+
+                /* Every process does this. */
+                iinfo = symbfact(options, iam, &GAC, perm_c, etree,
+                                 Glu_persist, Glu_freeable);
+
+                stat->utime[SYMBFAC] = SuperLU_timer_() - t;
+                if (iinfo < 0)
+                {
+                    /* Successful return */
+                    QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage);
+
+#if (PRNTlevel >= 1)
+                    if (!iam)
+                    {
+                        printf("\tNo of supers %ld\n",
+                               (long)Glu_persist->supno[n - 1] + 1);
+                        printf("\tSize of G(L) %ld\n", (long)Glu_freeable->xlsub[n]);
+                        printf("\tSize of G(U) %ld\n", (long)Glu_freeable->xusub[n]);
+                        printf("\tint %lu, short %lu, float %lu, double %lu\n",
+                               sizeof(int_t), sizeof(short),
+                               sizeof(float), sizeof(double));
+                        printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n",
+                               symb_mem_usage.for_lu * 1e-6,
+                               symb_mem_usage.total * 1e-6,
+                               symb_mem_usage.expansions);
+                    }
+#endif
+                }
+                else
+                {
+                    if (!iam)
+                    {
+                        fprintf(stderr, "symbfact() error returns %d\n",
+                                (int)iinfo);
+                        exit(-1);
+                    }
+                }
+
+            } /* end serial symbolic factorization */
+            else
+            { /* parallel symbolic factorization */
+                t = SuperLU_timer_();
+                flinfo =
+                    symbfact_dist(options, nprocs_num, noDomains,
+                                  A, perm_c, perm_r,
+                                  sizes, fstVtxSep, &Pslu_freeable,
+                                  &(grid->comm), &symb_comm,
+                                  &symb_mem_usage);
+                stat->utime[SYMBFAC] = SuperLU_timer_() - t;
+                if (flinfo > 0)
+                    ABORT("Insufficient memory for parallel symbolic factorization.");
+            }
+
+            /* Destroy GA */
+            if (parSymbFact == NO || options->RowPerm != NO)
+                Destroy_CompCol_Matrix_dist(&GA);
+            if (parSymbFact == NO)
+                Destroy_CompCol_Permuted_dist(&GAC);
+
+        } /* end if Fact not SamePattern_SameRowPerm */
+
+#if (DEBUGlevel >= 2) // Sherry
+        if (!iam)
+            PrintInt10("perm_c", m, perm_c);
+#endif
+        if (sizes)
+            SUPERLU_FREE(sizes);
+        if (fstVtxSep)
+            SUPERLU_FREE(fstVtxSep);
+        if (symb_comm != MPI_COMM_NULL)
+            MPI_Comm_free(&symb_comm);
+
+        if (parSymbFact == NO || Fact == SamePattern_SameRowPerm)
+        {
+            /* Apply column permutation to the original distributed A */
+            for (j = 0; j < nnz_loc; ++j)
+                colind[j] = perm_c[colind[j]];
+
+            /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage.
+               NOTE: the row permutation Pc*Pr is applied internally in the
+               distribution routine. */
+            t = SuperLU_timer_();
+
+            nsupers = getNsupers(n, LUstruct->Glu_persist);
+
+            if (Fact != SamePattern_SameRowPerm)
+            {
+                LUstruct->trf3Dpart = SUPERLU_MALLOC(sizeof(dtrf3Dpartition_t));
+                newTrfPartitionInit(nsupers, LUstruct, grid3d);
+                trf3Dpartition = LUstruct->trf3Dpart;
+            }
+
+            dist_mem_use = pddistribute3d_Yang(options, n, A, ScalePermstruct,
+                                               Glu_freeable, LUstruct, grid3d);
+
+            if (Fact != SamePattern_SameRowPerm)
+            {
+                /* now that LU structure has been scattered, initialize the LU and buffers */
+                dinit3DLUstructForest(trf3Dpartition->myTreeIdxs, trf3Dpartition->myZeroTrIdxs,
+                                      trf3Dpartition->sForests, LUstruct, grid3d);
+                dLUValSubBuf_t *LUvsb = SUPERLU_MALLOC(sizeof(dLUValSubBuf_t));
+                dLluBufInit(LUvsb, LUstruct);
+                trf3Dpartition->LUvsb = LUvsb;
+                trf3Dpartition->iperm_c_supno = create_iperm_c_supno(nsupers, options, LUstruct, grid3d);
+            }
+
+            stat->utime[DIST] = SuperLU_timer_() - t;
+
+            /* Deallocate storage used in symbolic factorization. */
+            if (Fact != SamePattern_SameRowPerm)
+            {
+                iinfo = symbfact_SubFree(Glu_freeable);
+                SUPERLU_FREE(Glu_freeable);
+            }
+        }
+        else
+        {
+            /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage.
+               NOTE: the row permutation Pc*Pr is applied internally in the
+               distribution routine. */
+            /* Apply column permutation to the original distributed A */
+            for (j = 0; j < nnz_loc; ++j)
+                colind[j] = perm_c[colind[j]];
+
+            t = SuperLU_timer_();
+            dist_mem_use = ddist_psymbtonum(options, n, A, ScalePermstruct,
+                                            &Pslu_freeable, LUstruct, grid);
+            if (dist_mem_use > 0)
+                ABORT("Not enough memory available for dist_psymbtonum\n");
+
+            stat->utime[DIST] = SuperLU_timer_() - t;
+
+            ABORT("ddist_psymbtonum does not yet work with 3D factorization\n");
+        }
+
+        /*if (!iam) printf ("\tDISTRIBUTE time %8.2f\n", stat->utime[DIST]); */
+
+        /* Perform numerical factorization in parallel on all process layers. */
+
+        /* nvshmem related. The nvshmem_malloc has to be called before
+           trs_compute_communication_structure, otherwise solve is much slower. */
+#ifdef HAVE_NVSHMEM
+        int nc = CEILING( nsupers, grid->npcol );
+        int nr = CEILING( nsupers, grid->nprow );
+        int flag_bc_size = RDMA_FLAG_SIZE * (nc+1);
+        int flag_rd_size = RDMA_FLAG_SIZE * nr * 2;
+        int my_flag_bc_size = RDMA_FLAG_SIZE * (nc+1);
+        int my_flag_rd_size = RDMA_FLAG_SIZE * nr * 2;
+        int maxrecvsz = sp_ienv_dist(3, options) * nrhs + SUPERLU_MAX( XK_H, LSUM_H );
+        int ready_x_size = maxrecvsz * nc;
+        int ready_lsum_size = 2 * maxrecvsz * nr;
+        if (getenv("SUPERLU_ACC_SOLVE")) {
+            nv_init_wrapper(grid->comm);
+            prepare_multiGPU_buffers(flag_bc_size, flag_rd_size, ready_x_size,
+                                     ready_lsum_size, my_flag_bc_size, my_flag_rd_size);
+        }
+#endif
+
+        SCT_t *SCT = (SCT_t *)SUPERLU_MALLOC(sizeof(SCT_t));
+        SCT_init(SCT);
+
+#if (PRNTlevel >= 1)
+        if (grid3d->iam == 0)
+        {
+            printf("after 3D initialization.\n");
+            fflush(stdout);
+        }
+#endif
+
+        t = SuperLU_timer_();
+
+        /* factorize in grid 1 */
+        // if(grid3d->zscp.Iam)
+        // get environment variable TRF3DVERSION
+#ifdef GPU_ACC
+        if (gpu3dVersion == 1)
+        { /* this is the new C++ code in TRF3dV100/ directory */
+
+            if (!grid3d->iam)
+                printf("Using pdgstrf3d+gpu version 1 for Summit\n");
+#if 0
+            pdgstrf3d_summit(options, m, n, anorm, trf3Dpartition, SCT, LUstruct,
+                             grid3d, stat, info);
+#else
+            int_t ldt = sp_ienv_dist(3, options); /* Size of maximum supernode */
+            double s_eps = smach_dist("Epsilon");
+            double thresh = s_eps * anorm;
+
+            /* call constructor in C++ code */
+            LUgpu = createLUgpuHandle(nsupers, ldt, trf3Dpartition, LUstruct, grid3d,
+                                      SCT, options, stat, thresh, info);
+
+            /* call pdgstrf3d() in C++ code */
+            pdgstrf3d_LUpackedInterface(LUgpu);
+
+            copyLUGPU2Host(LUgpu, LUstruct);
+            destroyLUgpuHandle(LUgpu);
+
+            // print other stuff
+            // if (!grid3d->zscp.Iam)
+            //     SCT_printSummary(grid, SCT);
+            reduceStat(FACT, stat, grid3d);
+#endif
+        }
+        else /* this is the old C code, with less GPU offload */
+#endif /* matching ifdef GPU_ACC */
+        {
+            pdgstrf3d(options, m, n, anorm, trf3Dpartition, SCT, LUstruct,
+                      grid3d, stat, info);
+
+            // dDumpLblocks3D(nsupers, grid3d, LUstruct->Glu_persist, LUstruct->Llu);
+        }
+
+        if (getenv("NEW3DSOLVE")) {
+            dbroadcastAncestor3d(trf3Dpartition, LUstruct, grid3d, SCT);
+        }
+
+        if ( options->Fact != SamePattern_SameRowPerm ) {
+            if (getenv("NEW3DSOLVE") && Solve3D == true) {
+                trs_compute_communication_structure(options, n, LUstruct,
+                               ScalePermstruct, trf3Dpartition->supernodeMask, grid, stat);
+            } else {
+                int* supernodeMask = int32Malloc_dist(nsupers);
+                for (int ii = 0; ii < nsupers; ii++)
+                    supernodeMask[ii] = 1;
+                trs_compute_communication_structure(options, n, LUstruct,
+                               ScalePermstruct, supernodeMask, grid, stat);
+                SUPERLU_FREE(supernodeMask);
+            }
+        }
+
+        stat->utime[FACT] = SuperLU_timer_() - t;
+
+        /* factorize in grid 1 */
+        // if(grid3d->zscp.Iam)
+        double tgather = SuperLU_timer_();
+        if (Solve3D == false) {
+            dgatherAllFactoredLU(trf3Dpartition, LUstruct, grid3d, SCT);
+        }
+        SCT->gatherLUtimer += SuperLU_timer_() - tgather;
+        /* print stats for bottom grid */
+
+        // Write LU to file
+        int writeLU = 0;
+        if (getenv("WRITELU"))
+        {
+            writeLU = atoi(getenv("WRITELU"));
+        }
+
+        if (writeLU)
+        {
+            if (!grid3d->zscp.Iam)
+                writeLUtoDisk(nsupers, LUstruct->Glu_persist->xsup, LUstruct);
+        }
+
+        int checkLU = 0;
+        if (getenv("CHECKLU"))
+        {
+            checkLU = atoi(getenv("CHECKLU"));
+        }
+
+        if (checkLU)
+        {
+            if (!grid3d->zscp.Iam)
+                checkLUFromDisk(nsupers, LUstruct->Glu_persist->xsup, LUstruct);
+        }
+
+#if (PRNTlevel >= 0)
+        if (!grid3d->zscp.Iam)
+        {
+            SCT_print(grid, SCT);
+            SCT_print3D(grid3d, SCT);
+        }
+        SCT_printComm3D(grid3d, SCT);
+
+        /* print memory usage */
+        d3D_printMemUse(trf3Dpartition, LUstruct, grid3d);
+
+        SCT->gatherLUtimer += SuperLU_timer_() - tgather;
+        /* print stats for bottom grid */
+        /* print forest weight and costs */
+        printForestWeightCost(trf3Dpartition->sForests, SCT, grid3d);
+        /* reduces stat from all the layers */
+#endif
+
+        SCT_free(SCT);
+
+    } /* end if not Factored ... factor on all process layers */
+
+    if (grid3d->zscp.Iam == 0)
+    { // only process layer 0
+        if (!factored)
+        {
+            if (options->PrintStat)
+            {
+                int_t TinyPivots;
+                float for_lu, total, avg, loc_max;
+                float mem_stage[3];
+                struct { float val; int rank; } local_struct, global_struct;
+
+                MPI_Reduce( &stat->TinyPivots, &TinyPivots, 1, mpi_int_t,
+                            MPI_SUM, 0, grid->comm );
+                stat->TinyPivots = TinyPivots;
+
+                /*-- Compute high watermark of all stages --*/
+                if (parSymbFact == TRUE)
+                {
+                    /* The memory used in the redistribution routine
+                       includes the memory used for storing the symbolic
+                       structure and the memory allocated for numerical
+                       factorization */
+                    mem_stage[0] = (-flinfo);       /* symbfact step */
+                    mem_stage[1] = (-dist_mem_use); /* distribution step */
+                    loc_max = SUPERLU_MAX( mem_stage[0], mem_stage[1] );
+                    if (options->RowPerm != NO )
+                        loc_max = SUPERLU_MAX(loc_max, GA_mem_use);
+                }
+                else
+                {
+                    mem_stage[0] = symb_mem_usage.total + GA_mem_use; /* symbfact step */
+                    mem_stage[1] = symb_mem_usage.for_lu + dist_mem_use + num_mem_usage.for_lu; /* distribution step */
+                    loc_max = SUPERLU_MAX(mem_stage[0], mem_stage[1] );
+                }
+
+                dQuerySpace_dist(n, LUstruct, grid, stat, &num_mem_usage);
+                mem_stage[2] = num_mem_usage.total; /* numerical factorization step */
+
+                loc_max = SUPERLU_MAX(loc_max, mem_stage[2] ); /* local max of 3 stages */
+
+                local_struct.val = loc_max;
+                local_struct.rank = grid->iam;
+                MPI_Reduce( &local_struct, &global_struct, 1, MPI_FLOAT_INT, MPI_MAXLOC, 0, grid->comm );
+                int all_highmark_rank = global_struct.rank;
+                float all_highmark_mem = global_struct.val * 1e-6;
+
+                MPI_Reduce( &loc_max, &avg,
+                            1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
+                MPI_Reduce( &num_mem_usage.for_lu, &for_lu,
+                            1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
+                MPI_Reduce( &num_mem_usage.total, &total,
+                            1, MPI_FLOAT, MPI_SUM, 0, grid->comm );
+
+                /*-- Compute memory usage of numerical factorization --*/
+                local_struct.val = num_mem_usage.for_lu;
+                MPI_Reduce(&local_struct, &global_struct, 1, MPI_FLOAT_INT, MPI_MAXLOC, 0, grid->comm);
+                int lu_max_rank = global_struct.rank;
+                float lu_max_mem = global_struct.val * 1e-6;
+
+                local_struct.val = stat->peak_buffer;
+                MPI_Reduce( &local_struct, &global_struct, 1, MPI_FLOAT_INT, MPI_MAXLOC, 0, grid->comm );
+                int buffer_peak_rank = global_struct.rank;
+                float buffer_peak = global_struct.val * 1e-6;
+                if (iam == 0)
+                {
+                    printf("\n** Memory Usage **********************************\n");
+                    printf("** Total highmark (MB):\n"
+                           "    Sum-of-all : %8.2f | Avg : %8.2f | Max : %8.2f\n",
+                           avg * 1e-6,
+                           avg / grid->nprow / grid->npcol * 1e-6,
+                           all_highmark_mem);
+                    printf("    Max at rank %d, different stages (MB):\n"
+                           "\t. symbfact     %8.2f\n"
+                           "\t. distribution %8.2f\n"
+                           "\t. numfact      %8.2f\n",
+                           all_highmark_rank, mem_stage[0] * 1e-6, mem_stage[1] * 1e-6, mem_stage[2] * 1e-6);
+                    printf("** NUMfact space (MB): (sum-of-all-processes)\n"
+                           "    L\\U : %8.2f | Total : %8.2f\n",
+                           for_lu * 1e-6, total * 1e-6);
+                    printf("\t. max at rank %d, max L+U memory (MB): %8.2f\n"
+                           "\t. max at rank %d, peak buffer (MB):    %8.2f\n",
+                           lu_max_rank, lu_max_mem,
+                           buffer_peak_rank, buffer_peak);
+                    printf("**************************************************\n\n");
+                    printf("** number of Tiny Pivots: %8d\n\n", stat->TinyPivots);
+                    fflush(stdout);
+                }
+            } /* end printing stats */
+
+        } /* end if not Factored */
+    }
+
+    if (Solve3D)
+    {
+        if ( options->Fact == DOFACT || options->Fact == SamePattern ) {
+            /* Need to reset the solve's communication pattern,
+               because perm_r[] and/or perm_c[] is changed. */
+            if ( options->SolveInitialized == YES ) { /* Initialized before */
+                dSolveFinalize(options, SOLVEstruct); /* Clean up structure */
+                pdgstrs_delete_device_lsum_x(SOLVEstruct);
+                options->SolveInitialized = NO; /* Reset the solve state */
+            }
+        }
+
+        if (getenv("NEW3DSOLVE")) {
+
+            if (options->DiagInv == YES && (Fact != FACTORED))
+            {
+                pdCompute_Diag_Inv(n, LUstruct, grid, stat, info);
+
+                // The following #ifdef GPU_ACC block frees and reallocates GPU data for trisolve. The data seems to be overwritten by pdgstrf3d.
+                int_t nsupers = getNsupers(n, LUstruct->Glu_persist);
+#if (defined(GPU_ACC) && defined(GPU_SOLVE))
+
+                pdconvertU(options, grid, LUstruct, stat, n);
+
+                // checkGPU(gpuFree(LUstruct->Llu->d_xsup));
+                // checkGPU(gpuFree(LUstruct->Llu->d_bcols_masked));
+                // checkGPU(gpuFree(LUstruct->Llu->d_LRtree_ptr));
+                // checkGPU(gpuFree(LUstruct->Llu->d_LBtree_ptr));
+                // checkGPU(gpuFree(LUstruct->Llu->d_URtree_ptr));
+                // checkGPU(gpuFree(LUstruct->Llu->d_UBtree_ptr));
+                // checkGPU(gpuFree(LUstruct->Llu->d_Lrowind_bc_dat));
+                // checkGPU(gpuFree(LUstruct->Llu->d_Lindval_loc_bc_dat));
+                // checkGPU(gpuFree(LUstruct->Llu->d_Lrowind_bc_offset));
+                // checkGPU(gpuFree(LUstruct->Llu->d_Lindval_loc_bc_offset));
+                // checkGPU(gpuFree(LUstruct->Llu->d_Lnzval_bc_offset));
+                // checkGPU(gpuFree(LUstruct->Llu->d_Linv_bc_offset));
+                // checkGPU(gpuFree(LUstruct->Llu->d_Uinv_bc_offset));
+                // checkGPU(gpuFree(LUstruct->Llu->d_ilsum));
+                // checkGPU(gpuFree(LUstruct->Llu->d_grid));
+                // checkGPU(gpuFree(LUstruct->Llu->d_Lnzval_bc_dat));
+                // checkGPU(gpuFree(LUstruct->Llu->d_Linv_bc_dat));
+                // checkGPU(gpuFree(LUstruct->Llu->d_Uinv_bc_dat));
+
+                // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_xsup, (n + 1) * sizeof(int_t)));
+                // checkGPU(gpuMemcpy(LUstruct->Llu->d_xsup, LUstruct->Glu_persist->xsup, (n + 1) * sizeof(int_t), gpuMemcpyHostToDevice));
+                // checkGPU(gpuMalloc( (void**)&LUstruct->Llu->d_bcols_masked, LUstruct->Llu->nbcol_masked * sizeof(int)));
+                // checkGPU(gpuMemcpy(LUstruct->Llu->d_bcols_masked, LUstruct->Llu->bcols_masked, LUstruct->Llu->nbcol_masked * sizeof(int), gpuMemcpyHostToDevice));
+                // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_LRtree_ptr, CEILING(nsupers, grid->nprow) * sizeof(C_Tree)));
+                // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_LBtree_ptr, CEILING(nsupers, grid->npcol) * sizeof(C_Tree)));
+                // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_URtree_ptr, CEILING(nsupers, grid->nprow) * sizeof(C_Tree)));
+                // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_UBtree_ptr, CEILING(nsupers, grid->npcol) * sizeof(C_Tree)));
+                // checkGPU(gpuMemcpy(LUstruct->Llu->d_LRtree_ptr, LUstruct->Llu->LRtree_ptr, CEILING(nsupers, grid->nprow) * sizeof(C_Tree), gpuMemcpyHostToDevice));
+                // checkGPU(gpuMemcpy(LUstruct->Llu->d_LBtree_ptr, LUstruct->Llu->LBtree_ptr, CEILING(nsupers, grid->npcol) * sizeof(C_Tree), gpuMemcpyHostToDevice));
+                // checkGPU(gpuMemcpy(LUstruct->Llu->d_URtree_ptr, LUstruct->Llu->URtree_ptr, CEILING(nsupers, grid->nprow) * sizeof(C_Tree), gpuMemcpyHostToDevice));
+                // checkGPU(gpuMemcpy(LUstruct->Llu->d_UBtree_ptr, LUstruct->Llu->UBtree_ptr, CEILING(nsupers, grid->npcol) * sizeof(C_Tree), gpuMemcpyHostToDevice));
+                // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lrowind_bc_dat, (LUstruct->Llu->Lrowind_bc_cnt) * sizeof(int_t)));
+                // checkGPU(gpuMemcpy(LUstruct->Llu->d_Lrowind_bc_dat, LUstruct->Llu->Lrowind_bc_dat, (LUstruct->Llu->Lrowind_bc_cnt) * sizeof(int_t), gpuMemcpyHostToDevice));
+                // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lindval_loc_bc_dat, (LUstruct->Llu->Lindval_loc_bc_cnt) * sizeof(int_t)));
+                // checkGPU(gpuMemcpy(LUstruct->Llu->d_Lindval_loc_bc_dat, LUstruct->Llu->Lindval_loc_bc_dat, (LUstruct->Llu->Lindval_loc_bc_cnt) * sizeof(int_t), gpuMemcpyHostToDevice));
+                // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lrowind_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int)));
+                // checkGPU(gpuMemcpy(LUstruct->Llu->d_Lrowind_bc_offset, LUstruct->Llu->Lrowind_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice));
+                // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lindval_loc_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int)));
+                // checkGPU(gpuMemcpy(LUstruct->Llu->d_Lindval_loc_bc_offset, LUstruct->Llu->Lindval_loc_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice));
+                // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lnzval_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int)));
+                // checkGPU(gpuMemcpy(LUstruct->Llu->d_Lnzval_bc_offset, LUstruct->Llu->Lnzval_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice));
+                // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Linv_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int)));
+                // checkGPU(gpuMemcpy(LUstruct->Llu->d_Linv_bc_offset, LUstruct->Llu->Linv_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice));
+                // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Uinv_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int)));
+                // checkGPU(gpuMemcpy(LUstruct->Llu->d_Uinv_bc_offset, LUstruct->Llu->Uinv_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice));
+                // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_ilsum, (CEILING(nsupers, grid->nprow) + 1) * sizeof(int_t)));
+                // checkGPU(gpuMemcpy(LUstruct->Llu->d_ilsum, LUstruct->Llu->ilsum, (CEILING(nsupers, grid->nprow) + 1) * sizeof(int_t), gpuMemcpyHostToDevice));
+                // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lnzval_bc_dat, (LUstruct->Llu->Lnzval_bc_cnt) * sizeof(double)));
+                // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Linv_bc_dat, (LUstruct->Llu->Linv_bc_cnt) * sizeof(double)));
+                // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Uinv_bc_dat, (LUstruct->Llu->Uinv_bc_cnt) * sizeof(double)));
+                // checkGPU(gpuMalloc( (void**)&LUstruct->Llu->d_grid, sizeof(gridinfo_t)));
+                // checkGPU(gpuMemcpy(LUstruct->Llu->d_grid, grid, sizeof(gridinfo_t), gpuMemcpyHostToDevice));
+#endif
+                if (getenv("SUPERLU_ACC_SOLVE")) {
+#ifdef GPU_ACC
+                    checkGPU(gpuMemcpy(LUstruct->Llu->d_Linv_bc_dat, LUstruct->Llu->Linv_bc_dat,
+                                       (LUstruct->Llu->Linv_bc_cnt) * sizeof(double), gpuMemcpyHostToDevice));
+                    checkGPU(gpuMemcpy(LUstruct->Llu->d_Uinv_bc_dat, LUstruct->Llu->Uinv_bc_dat,
+                                       (LUstruct->Llu->Uinv_bc_cnt) * sizeof(double), gpuMemcpyHostToDevice));
+                    checkGPU(gpuMemcpy(LUstruct->Llu->d_Lnzval_bc_dat, LUstruct->Llu->Lnzval_bc_dat,
+                                       (LUstruct->Llu->Lnzval_bc_cnt) * sizeof(double), gpuMemcpyHostToDevice));
+#endif
+                }
+            }
+        }
+    }
+    else
+    { /* if(Solve3D) */
+
+        if (grid3d->zscp.Iam == 0)
+        { /* on 2D grid-0 */
+
+            if ( options->Fact == DOFACT || options->Fact == SamePattern ) {
+                /* Need to reset the solve's communication pattern,
+                   because perm_r[] and/or perm_c[] is changed. */
+                if ( options->SolveInitialized == YES ) { /* Initialized before */
+                    dSolveFinalize(options, SOLVEstruct); /* Clean up structure */
+                    pdgstrs_delete_device_lsum_x(SOLVEstruct);
+                    options->SolveInitialized = NO; /* Reset the solve state */
+                }
+            }
+
+#if (defined(GPU_ACC) && defined(GPU_SOLVE))
+            if (options->DiagInv == NO)
+            {
+                if (iam == 0)
+                {
+                    printf("!!WARNING: GPU trisolve requires setting options->DiagInv==YES\n");
+                    printf("           otherwise, use CPU trisolve\n");
+                    fflush(stdout);
+                }
+                // exit(0); // Sherry: need to return an error flag
+            }
+#endif
+
+            if (options->DiagInv == YES && (Fact != FACTORED))
+            {
+                pdCompute_Diag_Inv(n, LUstruct, grid, stat, info);
+
+                // The following #ifdef GPU_ACC block frees and reallocates GPU data for trisolve. The data seems to be overwritten by pdgstrf3d.
+                int_t nsupers = getNsupers(n, LUstruct->Glu_persist);
+#ifdef GPU_ACC
+
+                pdconvertU(options, grid, LUstruct, stat, n);
+
+                // checkGPU(gpuFree(LUstruct->Llu->d_xsup));
+                // checkGPU(gpuFree(LUstruct->Llu->d_bcols_masked));
+                // checkGPU(gpuFree(LUstruct->Llu->d_LRtree_ptr));
+                // checkGPU(gpuFree(LUstruct->Llu->d_LBtree_ptr));
+                // checkGPU(gpuFree(LUstruct->Llu->d_URtree_ptr));
+                // checkGPU(gpuFree(LUstruct->Llu->d_UBtree_ptr));
+                // checkGPU(gpuFree(LUstruct->Llu->d_Lrowind_bc_dat));
+                // checkGPU(gpuFree(LUstruct->Llu->d_Lindval_loc_bc_dat));
+                // checkGPU(gpuFree(LUstruct->Llu->d_Lrowind_bc_offset));
+                // checkGPU(gpuFree(LUstruct->Llu->d_Lindval_loc_bc_offset));
+                // checkGPU(gpuFree(LUstruct->Llu->d_Lnzval_bc_offset));
+                // checkGPU(gpuFree(LUstruct->Llu->d_Linv_bc_offset));
+                // checkGPU(gpuFree(LUstruct->Llu->d_Uinv_bc_offset));
+                // checkGPU(gpuFree(LUstruct->Llu->d_ilsum));
+                // checkGPU(gpuFree(LUstruct->Llu->d_grid));
+                // checkGPU(gpuFree(LUstruct->Llu->d_Lnzval_bc_dat));
+                // checkGPU(gpuFree(LUstruct->Llu->d_Linv_bc_dat));
+                // checkGPU(gpuFree(LUstruct->Llu->d_Uinv_bc_dat));
+
+                // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_xsup, (n + 1) * sizeof(int_t)));
+                // checkGPU(gpuMemcpy(LUstruct->Llu->d_xsup, LUstruct->Glu_persist->xsup, (n + 1) * sizeof(int_t), gpuMemcpyHostToDevice));
+                // checkGPU(gpuMalloc( (void**)&LUstruct->Llu->d_bcols_masked, LUstruct->Llu->nbcol_masked * sizeof(int)));
+                // checkGPU(gpuMemcpy(LUstruct->Llu->d_bcols_masked, LUstruct->Llu->bcols_masked, LUstruct->Llu->nbcol_masked * sizeof(int), gpuMemcpyHostToDevice));
+                // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_LRtree_ptr, CEILING(nsupers, grid->nprow) * sizeof(C_Tree)));
+                // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_LBtree_ptr, CEILING(nsupers, grid->npcol) * sizeof(C_Tree)));
+                // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_URtree_ptr, CEILING(nsupers, grid->nprow) * sizeof(C_Tree)));
+                // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_UBtree_ptr, CEILING(nsupers, grid->npcol) * sizeof(C_Tree)));
+                // checkGPU(gpuMemcpy(LUstruct->Llu->d_LRtree_ptr, LUstruct->Llu->LRtree_ptr, CEILING(nsupers, grid->nprow) * sizeof(C_Tree), gpuMemcpyHostToDevice));
+                // checkGPU(gpuMemcpy(LUstruct->Llu->d_LBtree_ptr, LUstruct->Llu->LBtree_ptr, CEILING(nsupers, grid->npcol) * sizeof(C_Tree), gpuMemcpyHostToDevice));
+                // checkGPU(gpuMemcpy(LUstruct->Llu->d_URtree_ptr, LUstruct->Llu->URtree_ptr, CEILING(nsupers, grid->nprow) * sizeof(C_Tree), gpuMemcpyHostToDevice));
+                // checkGPU(gpuMemcpy(LUstruct->Llu->d_UBtree_ptr, LUstruct->Llu->UBtree_ptr, CEILING(nsupers, grid->npcol) * sizeof(C_Tree), gpuMemcpyHostToDevice));
+                // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lrowind_bc_dat, (LUstruct->Llu->Lrowind_bc_cnt) * sizeof(int_t)));
+                // checkGPU(gpuMemcpy(LUstruct->Llu->d_Lrowind_bc_dat, LUstruct->Llu->Lrowind_bc_dat, (LUstruct->Llu->Lrowind_bc_cnt) * sizeof(int_t), gpuMemcpyHostToDevice));
+                // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lindval_loc_bc_dat, (LUstruct->Llu->Lindval_loc_bc_cnt) * sizeof(int_t)));
+                // checkGPU(gpuMemcpy(LUstruct->Llu->d_Lindval_loc_bc_dat, LUstruct->Llu->Lindval_loc_bc_dat, (LUstruct->Llu->Lindval_loc_bc_cnt) * sizeof(int_t), gpuMemcpyHostToDevice));
+                // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lrowind_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int)));
+                // checkGPU(gpuMemcpy(LUstruct->Llu->d_Lrowind_bc_offset, LUstruct->Llu->Lrowind_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice));
+                // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lindval_loc_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int)));
+                // checkGPU(gpuMemcpy(LUstruct->Llu->d_Lindval_loc_bc_offset, LUstruct->Llu->Lindval_loc_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice));
+                // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lnzval_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int)));
+                // checkGPU(gpuMemcpy(LUstruct->Llu->d_Lnzval_bc_offset, LUstruct->Llu->Lnzval_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice));
+                // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Linv_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int)));
+                // checkGPU(gpuMemcpy(LUstruct->Llu->d_Linv_bc_offset, LUstruct->Llu->Linv_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice));
+                // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Uinv_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int)));
+                // checkGPU(gpuMemcpy(LUstruct->Llu->d_Uinv_bc_offset, LUstruct->Llu->Uinv_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice));
+                // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_ilsum, (CEILING(nsupers, grid->nprow) + 1) * sizeof(int_t)));
+                // checkGPU(gpuMemcpy(LUstruct->Llu->d_ilsum, LUstruct->Llu->ilsum, (CEILING(nsupers, grid->nprow) + 1) * sizeof(int_t), gpuMemcpyHostToDevice));
+                // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lnzval_bc_dat, (LUstruct->Llu->Lnzval_bc_cnt) * sizeof(double)));
+                // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Linv_bc_dat, (LUstruct->Llu->Linv_bc_cnt) * sizeof(double)));
+                // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Uinv_bc_dat, (LUstruct->Llu->Uinv_bc_cnt) * sizeof(double)));
+                // checkGPU(gpuMalloc( (void**)&LUstruct->Llu->d_grid, sizeof(gridinfo_t)));
+                // checkGPU(gpuMemcpy(LUstruct->Llu->d_grid, grid, sizeof(gridinfo_t), gpuMemcpyHostToDevice));
+#endif
+
+                if (getenv("SUPERLU_ACC_SOLVE")) {
+#ifdef GPU_ACC
+                    checkGPU(gpuMemcpy(LUstruct->Llu->d_Linv_bc_dat, LUstruct->Llu->Linv_bc_dat,
+                                       (LUstruct->Llu->Linv_bc_cnt) * sizeof(double), gpuMemcpyHostToDevice));
+                    checkGPU(gpuMemcpy(LUstruct->Llu->d_Uinv_bc_dat, LUstruct->Llu->Uinv_bc_dat,
+                                       (LUstruct->Llu->Uinv_bc_cnt) * sizeof(double), gpuMemcpyHostToDevice));
+                    checkGPU(gpuMemcpy(LUstruct->Llu->d_Lnzval_bc_dat, LUstruct->Llu->Lnzval_bc_dat,
+                                       (LUstruct->Llu->Lnzval_bc_cnt) * sizeof(double), gpuMemcpyHostToDevice));
+#endif
+                }
+            }
+        }
+    }
+
+    /* ------------------------------------------------------------
+       Compute the solution matrix X.
+       ------------------------------------------------------------ */
+    if ((nrhs > 0) && (*info == 0))
+    {
+        if (options->SolveInitialized == NO) {
+            if (getenv("SUPERLU_ACC_SOLVE")) {
+                if (getenv("NEW3DSOLVE") && Solve3D == true) {
+                    pdgstrs_init_device_lsum_x(options, n, m_loc, nrhs, grid, LUstruct, SOLVEstruct, trf3Dpartition->supernodeMask);
+                } else {
+                    int* supernodeMask = int32Malloc_dist(nsupers);
+                    for (int ii = 0; ii < nsupers; ii++)
+                        supernodeMask[ii] = 1;
+                    pdgstrs_init_device_lsum_x(options, n, m_loc, nrhs, grid, LUstruct, SOLVEstruct, supernodeMask);
+                    SUPERLU_FREE(supernodeMask);
+                }
+            }
+        }
+
+        stat->utime[SOLVE] = 0.0;
+        if (Solve3D)
+        {
+            // if (!(b_work = doubleMalloc_dist(n)))
+            //     ABORT("Malloc fails for b_work[]");
+            /* ------------------------------------------------------
+               Scale the right-hand side if equilibration was performed
+               ------------------------------------------------------ */
+            if (notran)
+            {
+                if (rowequ)
+                {
+                    b_col = B;
+                    for (j = 0; j < nrhs; ++j)
+                    {
+                        irow = fst_row;
+                        for (i = 0; i < m_loc; ++i)
+                        {
+                            b_col[i] *= R[irow];
+                            ++irow;
+                        }
+                        b_col += ldb;
+                    }
+                }
+            }
+            else if (colequ)
+            {
+                b_col = B;
+                for (j = 0; j < nrhs; ++j)
+                {
+                    irow = fst_row;
+                    for (i = 0; i < m_loc; ++i)
+                    {
+                        b_col[i] *= C[irow];
+                        ++irow;
+                    }
+                    b_col += ldb;
+                }
+            }
+
+            /* Save a copy of the right-hand side. */
+            ldx = ldb;
+            if (!(X = doubleMalloc_dist(((size_t)ldx) * nrhs)))
+                ABORT("Malloc fails for X[]");
+            x_col = X;
+            b_col = B;
+            for (j = 0; j < nrhs; ++j)
+            {
+                for (i = 0; i < m_loc; ++i)
+                    x_col[i] = b_col[i];
+                x_col += ldx;
+                b_col += ldb;
+            }
+
+            /* ------------------------------------------------------
+               Solve the linear system.
+               ------------------------------------------------------ */
+            if (options->SolveInitialized == NO)
+            /* First time */
+            /* Inside this routine, SolveInitialized is set to YES.
+               For repeated calls to pdgssvx3d(), no need to re-initialize
+               the Solve data & communication structures, unless a new
+               factorization with Fact == DOFACT or SamePattern is asked for. */
+            {
+                dSolveInit(options, A, perm_r, perm_c, nrhs, LUstruct,
+                           grid, SOLVEstruct);
+            }
+            if (getenv("NEW3DSOLVE")) {
+                pdgstrs3d_newsolve(options, n, LUstruct, ScalePermstruct, trf3Dpartition, grid3d, X,
+                                   m_loc, fst_row, ldb, nrhs, SOLVEstruct, stat, info);
+            } else {
+                pdgstrs3d(options, n, LUstruct, ScalePermstruct, trf3Dpartition, grid3d, X,
+                          m_loc, fst_row, ldb, nrhs, SOLVEstruct, stat, info);
+            }
+            if (options->IterRefine)
+            {
+                /* Improve the solution by iterative refinement. */
+                int_t *it, *colind_gsmv = SOLVEstruct->A_colind_gsmv;
+                dSOLVEstruct_t *SOLVEstruct1; /* Used by refinement */
+
+                t = SuperLU_timer_ ();
+                if (options->RefineInitialized == NO || Fact == DOFACT) {
+                    /* All these cases need to re-initialize gsmv structure */
+                    if (options->RefineInitialized)
+                        pdgsmv_finalize (SOLVEstruct->gsmv_comm);
+                    pdgsmv_init (A, SOLVEstruct->row_to_proc, grid,
+                                 SOLVEstruct->gsmv_comm);
+
+                    /* Save a copy of the transformed local col indices
+                       in colind_gsmv[]. */
*/ + if (colind_gsmv) SUPERLU_FREE (colind_gsmv); + if (!(it = intMalloc_dist (nnz_loc))) + ABORT ("Malloc fails for colind_gsmv[]"); + colind_gsmv = SOLVEstruct->A_colind_gsmv = it; + for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i]; + options->RefineInitialized = YES; + } + else if (Fact == SamePattern || Fact == SamePattern_SameRowPerm) { + double at; + int_t k, jcol, p; + /* Swap to beginning the part of A corresponding to the + local part of X, as was done in pdgsmv_init() */ + for (i = 0; i < m_loc; ++i) { /* Loop through each row */ + k = rowptr[i]; + for (j = rowptr[i]; j < rowptr[i + 1]; ++j) + { + jcol = colind[j]; + p = SOLVEstruct->row_to_proc[jcol]; + if (p == iam) + { /* Local */ + at = a[k]; + a[k] = a[j]; + a[j] = at; + ++k; + } + } + } + + /* Re-use the local col indices of A obtained from the + previous call to pdgsmv_init() */ + for (i = 0; i < nnz_loc; ++i) + colind[i] = colind_gsmv[i]; + } + + if (nrhs == 1) + { /* Use the existing solve structure */ + SOLVEstruct1 = SOLVEstruct; + } + else { + /* For nrhs > 1, since refinement is performed for RHS + one at a time, the communication structure for pdgstrs + is different than the solve with nrhs RHS. + So we use SOLVEstruct1 for the refinement step. + */ + if (!(SOLVEstruct1 = (dSOLVEstruct_t *) + SUPERLU_MALLOC(sizeof(dSOLVEstruct_t)))) + ABORT ("Malloc fails for SOLVEstruct1"); + /* Copy the same stuff */ + SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc; + SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c; + SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs; + SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs; + SOLVEstruct1->diag_len = SOLVEstruct->diag_len; + SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm; + SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv; + + /* Initialize the *gstrs_comm for 1 RHS. */ + if (!(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *) + SUPERLU_MALLOC (sizeof (pxgstrs_comm_t)))) + ABORT ("Malloc fails for gstrs_comm[]"); + pdgstrs_init (n, m_loc, 1, fst_row, perm_r, perm_c, grid, + LUstruct->Glu_persist, SOLVEstruct1); + if (getenv("SUPERLU_ACC_SOLVE")){ + int_t nsupers = getNsupers(n, LUstruct->Glu_persist); + pdgstrs_init_device_lsum_x(options, n, m_loc, 1, grid,LUstruct, SOLVEstruct1,trf3Dpartition->supernodeMask); + } + } + + pdgsrfs3d (options, n, A, anorm, LUstruct, ScalePermstruct, grid3d, trf3Dpartition, + B, ldb, X, ldx, nrhs, SOLVEstruct1, berr, stat, info); + + /* Deallocate the storage associated with SOLVEstruct1 */ + if (nrhs > 1) + { + pdgstrs_delete_device_lsum_x(SOLVEstruct1); + pxgstrs_finalize (SOLVEstruct1->gstrs_comm); + SUPERLU_FREE (SOLVEstruct1); + } + + stat->utime[REFINE] = SuperLU_timer_ () - t; + } /* end IterRefine */ + }else{ + + if (grid3d->zscp.Iam == 0){ /* on 2D grid-0 */ + + /* ------------------------------------------------------ + Scale the right-hand side if equilibration was performed + ------------------------------------------------------*/ + if (notran) + { + if (rowequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= R[irow]; + ++irow; + } + b_col += ldb; + } + } + } + else if (colequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= C[irow]; + ++irow; + } + b_col += ldb; + } + } + + /* Save a copy of the right-hand side. 
*/ + ldx = ldb; + if (!(X = doubleMalloc_dist(((size_t)ldx) * nrhs))) + ABORT("Malloc fails for X[]"); + x_col = X; + b_col = B; + for (j = 0; j < nrhs; ++j) + { + for (i = 0; i < m_loc; ++i) + x_col[i] = b_col[i]; + x_col += ldx; + b_col += ldb; + } + + /* ------------------------------------------------------ + Solve the linear system. + ------------------------------------------------------*/ + if (options->SolveInitialized == NO) + /* First time */ + /* Inside this routine, SolveInitialized is set to YES. + For repeated call to pdgssvx3d(), no need to re-initialilze + the Solve data & communication structures, unless a new + factorization with Fact == DOFACT or SamePattern is asked for. */ + { + dSolveInit(options, A, perm_r, perm_c, nrhs, LUstruct, + grid, SOLVEstruct); + } + pdgstrs(options, n, LUstruct, ScalePermstruct, grid, X, m_loc, + fst_row, ldb, nrhs, SOLVEstruct, stat, info); + + /* ------------------------------------------------------------ + Use iterative refinement to improve the computed solution and + compute error bounds and backward error estimates for it. + ------------------------------------------------------------ */ + if (options->IterRefine) + { + /* Improve the solution by iterative refinement. */ + int_t *it, *colind_gsmv = SOLVEstruct->A_colind_gsmv; + dSOLVEstruct_t *SOLVEstruct1; /* Used by refinement */ + + t = SuperLU_timer_ (); + if (options->RefineInitialized == NO || Fact == DOFACT) { + /* All these cases need to re-initialize gsmv structure */ + if (options->RefineInitialized) + pdgsmv_finalize (SOLVEstruct->gsmv_comm); + pdgsmv_init (A, SOLVEstruct->row_to_proc, grid, + SOLVEstruct->gsmv_comm); + + /* Save a copy of the transformed local col indices + in colind_gsmv[]. */ + if (colind_gsmv) SUPERLU_FREE (colind_gsmv); + if (!(it = intMalloc_dist (nnz_loc))) + ABORT ("Malloc fails for colind_gsmv[]"); + colind_gsmv = SOLVEstruct->A_colind_gsmv = it; + for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i]; + options->RefineInitialized = YES; + } + else if (Fact == SamePattern || Fact == SamePattern_SameRowPerm) { + double at; + int_t k, jcol, p; + /* Swap to beginning the part of A corresponding to the + local part of X, as was done in pdgsmv_init() */ + for (i = 0; i < m_loc; ++i) { /* Loop through each row */ + k = rowptr[i]; + for (j = rowptr[i]; j < rowptr[i + 1]; ++j) + { + jcol = colind[j]; + p = SOLVEstruct->row_to_proc[jcol]; + if (p == iam) + { /* Local */ + at = a[k]; + a[k] = a[j]; + a[j] = at; + ++k; + } + } + } + + /* Re-use the local col indices of A obtained from the + previous call to pdgsmv_init() */ + for (i = 0; i < nnz_loc; ++i) + colind[i] = colind_gsmv[i]; + } + + if (nrhs == 1) + { /* Use the existing solve structure */ + SOLVEstruct1 = SOLVEstruct; + } + else { + /* For nrhs > 1, since refinement is performed for RHS + one at a time, the communication structure for pdgstrs + is different than the solve with nrhs RHS. + So we use SOLVEstruct1 for the refinement step. 
+ */ + if (!(SOLVEstruct1 = (dSOLVEstruct_t *) + SUPERLU_MALLOC(sizeof(dSOLVEstruct_t)))) + ABORT ("Malloc fails for SOLVEstruct1"); + /* Copy the same stuff */ + SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc; + SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c; + SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs; + SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs; + SOLVEstruct1->diag_len = SOLVEstruct->diag_len; + SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm; + SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv; + + /* Initialize the *gstrs_comm for 1 RHS. */ + if (!(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *) + SUPERLU_MALLOC (sizeof (pxgstrs_comm_t)))) + ABORT ("Malloc fails for gstrs_comm[]"); + pdgstrs_init (n, m_loc, 1, fst_row, perm_r, perm_c, grid, + LUstruct->Glu_persist, SOLVEstruct1); + if (getenv("SUPERLU_ACC_SOLVE")){ + int_t nsupers = getNsupers(n, LUstruct->Glu_persist); + int* supernodeMask = int32Malloc_dist(nsupers); + for(int ii=0; ii 1) + { + pdgstrs_delete_device_lsum_x(SOLVEstruct1); + pxgstrs_finalize (SOLVEstruct1->gstrs_comm); + SUPERLU_FREE (SOLVEstruct1); + } + + stat->utime[REFINE] = SuperLU_timer_ () - t; + } /* end IterRefine */ + } + } + +if (grid3d->zscp.Iam == 0) /* on 2D grid-0 */ + { + /* Permute the solution matrix B <= Pc'*X. */ + pdPermute_Dense_Matrix (fst_row, m_loc, SOLVEstruct->row_to_proc, + SOLVEstruct->inv_perm_c, + X, ldx, B, ldb, nrhs, grid); +#if ( DEBUGlevel>=2 ) + printf ("\n (%d) .. After pdPermute_Dense_Matrix(): b =\n", iam); + for (i = 0; i < m_loc; ++i) + printf ("\t(%d)\t%4d\t%.10f\n", iam, i + fst_row, B[i]); +#endif + /* Transform the solution matrix X to a solution of the original + system before the equilibration. */ + if (notran) + { + if (colequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= C[irow]; + ++irow; + } + b_col += ldb; + } + } + } + else if (rowequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= R[irow]; + ++irow; + } + b_col += ldb; + } + } + + // SUPERLU_FREE (b_work); + } + if (grid3d->zscp.Iam == 0 || Solve3D) + SUPERLU_FREE (X); + + } /* end if nrhs > 0 and factor successful */ + +#if ( PRNTlevel>=1 ) + if (!grid3d->iam) { + printf (".. DiagScale = %d\n", ScalePermstruct->DiagScale); + } +#endif + + + if ( grid3d->zscp.Iam == 0 ) { // only process layer 0 + /* Deallocate R and/or C if it was not used. */ + if (Equil && Fact != SamePattern_SameRowPerm) + { + switch (ScalePermstruct->DiagScale) { + case NOEQUIL: + SUPERLU_FREE (R); + SUPERLU_FREE (C); + break; + case ROW: + SUPERLU_FREE (C); + break; + case COL: + SUPERLU_FREE (R); + break; + default: break; + } + } + +#if 0 + if (!factored && Fact != SamePattern_SameRowPerm && !parSymbFact) + Destroy_CompCol_Permuted_dist (&GAC); +#endif + + } /* process layer 0 done solve */ + + /* Scatter the solution from 2D grid-0 to 3D grid */ + if (nrhs > 0) + dScatter_B3d(A3d, grid3d); + + B = A3d->B3d; // B is now assigned back to B3d on return + A->Store = Astore3d; // restore Astore to 3D + +#if (DEBUGlevel >= 1) + CHECK_MALLOC(iam, "Exit pdgssvx3d()"); +#endif +} diff --git a/SRC/pdgssvx3d_2pass_Yang.c b/SRC/pdgssvx3d_2pass_Yang.c new file mode 100755 index 00000000..036bc8f6 --- /dev/null +++ b/SRC/pdgssvx3d_2pass_Yang.c @@ -0,0 +1,2150 @@ + +/*! 
\file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + +/*! @file + * \brief Solves a system of linear equations A*X=B using 3D process grid. + * + *
+ * -- Distributed SuperLU routine (version 7.2) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ * October 5, 2021
+ * Last update: November 8, 2021  v7.2.0
+ */
+#include "superlu_ddefs.h"
+#include "TRF3dV100/superlu_summit.h"
+#include "pddistribute3d.h"
+#include "ssvx3dAux.c"
+int_t dgatherAllFactoredLU3d( dtrf3Dpartition_t*  trf3Dpartition,
+			   dLUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT );
+#include <stdbool.h>
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * PDGSSVX3D solves a system of linear equations A*X=B,
+ * by using Gaussian elimination with "static pivoting" to
+ * compute the LU factorization of A.
+ *
+ * Static pivoting is a technique that combines the numerical stability
+ * of partial pivoting with the scalability of Cholesky (no pivoting),
+ * to run accurately and efficiently on large numbers of processors.
+ * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed
+ * description of the parallel algorithms.
+ *
+ * The input matrices A and B are distributed by block rows.
+ * Here is a graphical illustration (0-based indexing):
+ *
+ *                        A                B
+ *               0 ---------------       ------
+ *                   |           |        |  |
+ *                   |           |   P0   |  |
+ *                   |           |        |  |
+ *                 ---------------       ------
+ *        - fst_row->|           |        |  |
+ *        |          |           |        |  |
+ *       m_loc       |           |   P1   |  |
+ *        |          |           |        |  |
+ *        -          |           |        |  |
+ *                 ---------------       ------
+ *                   |    .      |        |. |
+ *                   |    .      |        |. |
+ *                   |    .      |        |. |
+ *                 ---------------       ------
+ *
+ * where fst_row is the global row number of the first locally stored row,
+ *       m_loc is the number of rows local to this processor.
+ * These are defined in the 'SuperMatrix' structure, see supermatrix.h.
+ *
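+ * As an illustrative sketch (not part of this patch), a caller that has
+ * already computed its local block row (m_loc, fst_row, nnz_loc, and the
+ * local CSR arrays nzval, colind, rowptr; m and n are the global
+ * dimensions) wraps it in a SuperMatrix like this:
+ *
+ *     SuperMatrix A;
+ *     dCreate_CompRowLoc_Matrix_dist(&A, m, n, nnz_loc, m_loc, fst_row,
+ *                                    nzval, colind, rowptr,
+ *                                    SLU_NR_loc, SLU_D, SLU_GE);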
+ *
+ * Here are the options for using this code:
+ *
+ *   1. Independent of all the other options specified below, the
+ *      user must supply
+ *
+ *      -  B, the matrix of right-hand sides, distributed by block rows,
+ *            and its dimensions ldb (local) and nrhs (global)
+ *      -  grid3d, a structure describing the 3D process grid
+ *      -  options->IterRefine, which determines whether or not to
+ *            improve the accuracy of the computed solution using
+ *            iterative refinement
+ *
+ *      On output, B is overwritten with the solution X.
+ *
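+ *      As a minimal illustrative sketch (error handling omitted; A, the
+ *      3D grid, and the remaining structures are assumed to be set up as
+ *      described in this comment), a factor-and-solve call looks like:
+ *
+ *          set_default_options_dist(&options);
+ *          pdgssvx3d(&options, &A, &ScalePermstruct, B, ldb, nrhs,
+ *                    &grid3d, &LUstruct, &SOLVEstruct, berr, &stat, &info);
+ *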
+ *   2. Depending on options->Fact, the user has four options
+ *      for solving A*X=B. The standard option is for factoring
+ *      A "from scratch". (The other options, described below,
+ *      are used when A is sufficiently similar to a previously
+ *      solved problem to save time by reusing part or all of
+ *      the previous factorization.)
+ *
+ *      -  options->Fact = DOFACT: A is factored "from scratch"
+ *
+ *      In this case the user must also supply
+ *
+ *        o  A, the input matrix
+ *
+ *        as well as the following options to determine what matrix to
+ *        factorize.
+ *
+ *        o  options->Equil,   to specify how to scale the rows and columns
+ *                             of A to "equilibrate" it (to try to reduce its
+ *                             condition number and so improve the
+ *                             accuracy of the computed solution)
+ *
+ *        o  options->RowPerm, to specify how to permute the rows of A
+ *                             (typically to control numerical stability)
+ *
+ *        o  options->ColPerm, to specify how to permute the columns of A
+ *                             (typically to control fill-in and enhance
+ *                             parallelism during factorization)
+ *
+ *        o  options->ReplaceTinyPivot, to specify how to deal with tiny
+ *                             pivots encountered during factorization
+ *                             (to control numerical stability)
+ *
+ *      The outputs returned include
+ *
+ *        o  ScalePermstruct,  modified to describe how the input matrix A
+ *                             was equilibrated and permuted:
+ *          .  ScalePermstruct->DiagScale, indicates whether the rows and/or
+ *                                         columns of A were scaled
+ *          .  ScalePermstruct->R, array of row scale factors
+ *          .  ScalePermstruct->C, array of column scale factors
+ *          .  ScalePermstruct->perm_r, row permutation vector
+ *          .  ScalePermstruct->perm_c, column permutation vector
+ *
+ *          (part of ScalePermstruct may also need to be supplied on input,
+ *           depending on options->RowPerm and options->ColPerm as described
+ *           later).
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix diag(R)*A*diag(C)*Pc^T, where
+ *              Pc is the column permutation matrix determined by
+ *                  ScalePermstruct->perm_c
+ *              diag(R) and diag(C) are diagonal scaling matrices determined
+ *                  by ScalePermstruct->DiagScale, ScalePermstruct->R and
+ *                  ScalePermstruct->C
+ *
+ *        o  LUstruct, which contains the L and U factorization of A1 where
+ *
+ *                A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
+ *
+ *               (Note that A1 = Pc*Pr*Aout, where Aout is the matrix stored
+ *                in A on output.)
+ *
+ *   3. The second value of options->Fact assumes that a matrix with the same
+ *      sparsity pattern as A has already been factored:
+ *
+ *      -  options->Fact = SamePattern: A is factored, assuming that it has
+ *            the same nonzero pattern as a previously factored matrix. In
+ *            this case the algorithm saves time by reusing the previously
+ *            computed column permutation vector stored in
+ *            ScalePermstruct->perm_c and the "elimination tree" of A
+ *            stored in LUstruct->etree
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *        o  options->Equil
+ *        o  options->RowPerm
+ *        o  options->ReplaceTinyPivot
+ *
+ *      but not options->ColPerm, whose value is ignored. This is because the
+ *      previous column permutation from ScalePermstruct->perm_c is used as
+ *      input. The user must also supply
+ *
+ *        o  A, the input matrix
+ *        o  ScalePermstruct->perm_c, the column permutation
+ *        o  LUstruct->etree, the elimination tree
+ *
+ *      The outputs returned include
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix as described above
+ *        o  ScalePermstruct, modified to describe how the input matrix A was
+ *                            equilibrated and row permuted
+ *        o  LUstruct, modified to contain the new L and U factors
+ *
+ *   4. The third value of options->Fact assumes that a matrix B with the same
+ *      sparsity pattern as A has already been factored, and where the
+ *      row permutation of B can be reused for A. This is useful when A and B
+ *      have similar numerical values, so that the same row permutation
+ *      will make both factorizations numerically stable. This lets us reuse
+ *      all of the previously computed structure of L and U.
+ *
+ *      -  options->Fact = SamePattern_SameRowPerm: A is factored,
+ *            assuming not only the same nonzero pattern as the previously
+ *            factored matrix B, but reusing B's row permutation.
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *        o  options->Equil
+ *        o  options->ReplaceTinyPivot
+ *
+ *      but not options->RowPerm or options->ColPerm, whose values are
+ *      ignored. This is because the permutations from ScalePermstruct->perm_r
+ *      and ScalePermstruct->perm_c are used as input.
+ *
+ *      The user must also supply
+ *
+ *        o  A, the input matrix
+ *        o  ScalePermstruct->DiagScale, how the previous matrix was row
+ *                                       and/or column scaled
+ *        o  ScalePermstruct->R, the row scalings of the previous matrix,
+ *                               if any
+ *        o  ScalePermstruct->C, the column scalings of the previous matrix,
+ *                               if any
+ *        o  ScalePermstruct->perm_r, the row permutation of the previous
+ *                                    matrix
+ *        o  ScalePermstruct->perm_c, the column permutation of the previous
+ *                                    matrix
+ *        o  all of LUstruct, the previously computed information about
+ *                            L and U (the actual numerical values of L and U
+ *                            stored in LUstruct->Llu are ignored)
+ *
+ *      The outputs returned include
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix as described above
+ *        o  ScalePermstruct,  modified to describe how the input matrix A was
+ *                             equilibrated (thus ScalePermstruct->DiagScale,
+ *                             R and C may be modified)
+ *        o  LUstruct, modified to contain the new L and U factors
+ *
+ *   5. The fourth and last value of options->Fact assumes that A is
+ *      identical to a matrix that has already been factored on a previous
+ *      call, and reuses its entire LU factorization
+ *
+ *      -  options->Fact = FACTORED: A is identical to a previously
+ *            factorized matrix, so the entire previous factorization
+ *            can be reused.
+ *
+ *      In this case all the other options mentioned above are ignored
+ *      (options->Equil, options->RowPerm, options->ColPerm,
+ *       options->ReplaceTinyPivot)
+ *
+ *      The user must also supply
+ *
+ *        o  A, the unfactored matrix, only in the case that iterative
+ *              refinement is to be done (specifically A must be the output
+ *              A from the previous call, so that it has been scaled and permuted)
+ *        o  all of ScalePermstruct
+ *        o  all of LUstruct, including the actual numerical values of
+ *           L and U
+ *
+ *      all of which are unmodified on output.
+ *
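+ *   A sketch of the reuse pattern above, assuming a sequence of matrices
+ *   that share one sparsity pattern and have similar numerical values:
+ *
+ *       options.Fact = DOFACT;    // factor the first matrix from scratch
+ *       pdgssvx3d(&options, &A, &ScalePermstruct, B, ldb, nrhs,
+ *                 &grid3d, &LUstruct, &SOLVEstruct, berr, &stat, &info);
+ *
+ *       // refill A with the next matrix's values, then reuse the
+ *       // permutations and scalings computed above
+ *       options.Fact = SamePattern_SameRowPerm;
+ *       pdgssvx3d(&options, &A, &ScalePermstruct, B, ldb, nrhs,
+ *                 &grid3d, &LUstruct, &SOLVEstruct, berr, &stat, &info);
+ *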
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_dist_options_t* (global)
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *         The following fields should be defined for this structure:
+ *
+ *         o Fact (fact_t)
+ *           Specifies whether or not the factored form of the matrix
+ *           A is supplied on entry, and if not, how the matrix A should
+ *           be factorized based on the previous history.
+ *
+ *           = DOFACT: The matrix A will be factorized from scratch.
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ColPerm, ReplaceTinyPivot
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *           = SamePattern: the matrix A will be factorized assuming
+ *             that a factorization of a matrix with the same sparsity
+ *             pattern was performed prior to this one. Therefore, this
+ *             factorization will reuse column permutation vector
+ *             ScalePermstruct->perm_c and the elimination tree
+ *             LUstruct->etree
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ReplaceTinyPivot
+ *                          ScalePermstruct->perm_c
+ *                          LUstruct->etree
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          rest of ScalePermstruct (DiagScale, R, C, perm_r)
+ *                          rest of LUstruct (GLU_persist, Llu)
+ *
+ *           = SamePattern_SameRowPerm: the matrix A will be factorized
+ *             assuming that a factorization of a matrix with the same
+ *             sparsity pattern and similar numerical values was performed
+ *             prior to this one. Therefore, this factorization will reuse
+ *             both row and column scaling factors R and C, and the
+ *             both row and column permutation vectors perm_r and perm_c,
+ *             distributed data structure set up from the previous symbolic
+ *             factorization.
+ *                 Inputs:  A
+ *                          options->Equil, ReplaceTinyPivot
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          modified LUstruct->Llu
+ *           = FACTORED: the matrix A is already factored.
+ *                 Inputs:  all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *         o Equil (yes_no_t)
+ *           Specifies whether to equilibrate the system.
+ *           = NO:  no equilibration.
+ *           = YES: scaling factors are computed to equilibrate the system:
+ *                      diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B.
+ *                  Whether or not the system will be equilibrated depends
+ *                  on the scaling of the matrix A, but if equilibration is
+ *                  used, A is overwritten by diag(R)*A*diag(C) and B by
+ *                  diag(R)*B.
+ *
+ *         o RowPerm (rowperm_t)
+ *           Specifies how to permute rows of the matrix A.
+ *           = NATURAL:   use the natural ordering.
+ *           = LargeDiag_MC64: use the Duff/Koster algorithm to permute rows of
+ *                        the original matrix to make the diagonal large
+ *                        relative to the off-diagonal.
+ *           = LargeDiag_HWPM: use the parallel approximate-weight perfect
+ *                        matching to permute rows of the original matrix
+ *                        to make the diagonal large relative to the
+ *                        off-diagonal.
+ *           = MY_PERMR:  use the ordering given in ScalePermstruct->perm_r
+ *                        input by the user.
+ *
+ *         o ColPerm (colperm_t)
+ *           Specifies what type of column permutation to use to reduce fill.
+ *           = NATURAL:       natural ordering.
+ *           = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A.
+ *           = MMD_ATA:       minimum degree ordering on structure of A'*A.
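+ *           = METIS_AT_PLUS_A: METIS ordering on structure of A'+A.
+ *           = PARMETIS:      parallel METIS ordering on structure of A'+A.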
+ *           = MY_PERMC:      the ordering given in ScalePermstruct->perm_c.
+ *
+ *         o ReplaceTinyPivot (yes_no_t)
+ *           = NO:  do not modify pivots
+ *           = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during
+ *                  LU factorization.
+ *
+ *         o IterRefine (IterRefine_t)
+ *           Specifies how to perform iterative refinement.
+ *           = NO:     no iterative refinement.
+ *           = SLU_DOUBLE: accumulate residual in double precision.
+ *           = SLU_EXTRA:  accumulate residual in extra precision.
+ *
+ *         NOTE: all options must be identical on all processes when
+ *               calling this routine.
+ *
+ * A (input) SuperMatrix* (local); A resides on all 3D processes.
+ *         On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol).
+ *           The number of linear equations is A->nrow. The type of A must be:
+ *           Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE.
+ *           That is, A is stored in distributed compressed row format.
+ *           See supermatrix.h for the definition of 'SuperMatrix'.
+ *           This routine only handles square A, however, the LU factorization
+ *           routine PDGSTRF can factorize rectangular matrices.
+ *
+ *         Internally, A is gathered on 2D process grid-0, call it A2d.
+ *         On exit, A2d may be overwritten by diag(R)*A*diag(C)*Pc^T,
+ *           depending on ScalePermstruct->DiagScale and options->ColPerm:
+ *             if ScalePermstruct->DiagScale != NOEQUIL, A2d is overwritten by
+ *                diag(R)*A*diag(C).
+ *             if options->ColPerm != NATURAL, A2d is further overwritten by
+ *                diag(R)*A*diag(C)*Pc^T.
+ *           If all the above conditions are true, the LU decomposition is
+ *           performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T.
+ *
+ * ScalePermstruct (input/output) dScalePermstruct_t* (global)
+ *         The data structure to store the scaling and permutation vectors
+ *         describing the transformations performed to the matrix A.
+ *         It contains the following fields:
+ *
+ *         o DiagScale (DiagScale_t)
+ *           Specifies the form of equilibration that was done.
+ *           = NOEQUIL: no equilibration.
+ *           = ROW:     row equilibration, i.e., A was premultiplied by
+ *                      diag(R).
+ *           = COL:     Column equilibration, i.e., A was postmultiplied
+ *                      by diag(C).
+ *           = BOTH:    both row and column equilibration, i.e., A was
+ *                      replaced by diag(R)*A*diag(C).
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm,
+ *           DiagScale is an input argument; otherwise it is an output
+ *           argument.
+ *
+ *         o perm_r (int*)
+ *           Row permutation vector, which defines the permutation matrix Pr;
+ *           perm_r[i] = j means row i of A is in position j in Pr*A.
+ *           If options->RowPerm = MY_PERMR, or
+ *           options->Fact = SamePattern_SameRowPerm, perm_r is an
+ *           input argument; otherwise it is an output argument.
+ *
+ *         o perm_c (int*)
+ *           Column permutation vector, which defines the
+ *           permutation matrix Pc; perm_c[i] = j means column i of A is
+ *           in position j in A*Pc.
+ *           If options->ColPerm = MY_PERMC or options->Fact = SamePattern
+ *           or options->Fact = SamePattern_SameRowPerm, perm_c is an
+ *           input argument; otherwise, it is an output argument.
+ *           On exit, perm_c may be overwritten by the product of the input
+ *           perm_c and a permutation that postorders the elimination tree
+ *           of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree
+ *           is already in postorder.
+ *
+ *         o R (double *) dimension (A->nrow)
+ *           The row scale factors for A.
+ *           If DiagScale = ROW or BOTH, A is multiplied on the left by
+ *                          diag(R).
+ *           If DiagScale = NOEQUIL or COL, R is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, R is
+ *           an input argument; otherwise, R is an output argument.
+ *
+ *         o C (double *) dimension (A->ncol)
+ *           The column scale factors for A.
+ *           If DiagScale = COL or BOTH, A is multiplied on the right by
+ *                          diag(C).
+ *           If DiagScale = NOEQUIL or ROW, C is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, C is
+ *           an input argument; otherwise, C is an output argument.
+ *
+ * B       (input/output) double* (local)
+ *         On entry, the right-hand side matrix of dimension (m_loc, nrhs),
+ *           where m_loc is the number of rows stored locally on my
+ *           process and is defined in the data structure of matrix A.
+ *         On exit, the solution matrix if info = 0.
+ *
+ * ldb     (input) int (local)
+ *         The leading dimension of matrix B.
+ *
+ * nrhs    (input) int (global)
+ *         The number of right-hand sides.
+ *         If nrhs = 0, only LU decomposition is performed; the forward
+ *         and back substitutions are skipped.
+ *
+ * grid3d  (input) gridinfo3d_t* (global)
+ *         The 3D process grid. It contains the MPI communicator, the number
+ *         of process rows (NPROW), the number of process columns (NPCOL),
+ *         the replication factor in the Z dimension (NPDEP), and my
+ *         process rank. It is an input argument to all the parallel routines.
+ *         Grid3d can be initialized by subroutine SUPERLU_GRIDINIT3D.
+ *         See superlu_defs.h for the definition of 'gridinfo3d_t'.
+ *
+ * LUstruct (input/output) dLUstruct_t*
+ *         The data structures to store the distributed L and U factors.
+ *         It contains the following fields:
+ *
+ *         o etree (int*) dimension (A->ncol) (global)
+ *           Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'.
+ *           It is computed in sp_colorder() during the first factorization,
+ *           and is reused in the subsequent factorizations of the matrices
+ *           with the same nonzero pattern.
+ *           On exit of sp_colorder(), the columns of A are permuted so that
+ *           the etree is in a certain postorder. This postorder is reflected
+ *           in ScalePermstruct->perm_c.
+ *           NOTE:
+ *           Etree is a vector of parent pointers for a forest whose vertices
+ *           are the integers 0 to A->ncol-1; etree[root]==A->ncol.
+ *
+ *         o Glu_persist (Glu_persist_t*) (global)
+ *           Global data structure (xsup, supno) replicated on all processes,
+ *           describing the supernode partition in the factored matrices
+ *           L and U:
+ *	       xsup[s] is the leading column of the s-th supernode,
+ *             supno[i] is the supernode number to which column i belongs.
+ *
+ *         o Llu (dLocalLU_t*) (local)
+ *           The distributed data structures to store L and U factors.
+ *           See superlu_ddefs.h for the definition of 'dLocalLU_t'.
+ *
+ * SOLVEstruct (input/output) dSOLVEstruct_t*
+ *         The data structure to hold the communication pattern used
+ *         in the phases of triangular solution and iterative refinement.
+ *         This pattern should be initialized only once for repeated solutions.
+ *         If options->SolveInitialized = YES, it is an input argument.
+ *         If options->SolveInitialized = NO and nrhs != 0, it is an output
+ *         argument. See superlu_ddefs.h for the definition of 'dSOLVEstruct_t'.
+ *
+ * berr    (output) double*, dimension (nrhs) (global)
+ *         The componentwise relative backward error of each solution
+ *         vector X(j) (i.e., the smallest relative change in
+ *         any element of A or B that makes X(j) an exact solution).
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util_dist.h for the definition of 'SuperLUStat_t'.
+ *
+ * info    (output) int*
+ *         = 0: successful exit
+ *         < 0: if info = -i, the i-th argument had an illegal value
+ *         > 0: if info = i, and i is
+ *             <= A->ncol: U(i,i) is exactly zero. The factorization has
+ *                been completed, but the factor U is exactly singular,
+ *                so the solution could not be computed.
+ *             > A->ncol: number of bytes allocated when memory allocation
+ *                failure occurred, plus A->ncol.
+ *
+ * See superlu_ddefs.h for the definitions of various data types.
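+ *
+ * A sketch of the final cleanup, modeled on the pddrive3d example driver
+ * (adapt to which structures were actually initialized in your program):
+ *
+ *     if (grid3d.zscp.Iam == 0) {   // 2D layer 0 holds the 2D factors
+ *         dDestroy_LU(n, &grid3d.grid2d, &LUstruct);
+ *         dSolveFinalize(&options, &SOLVEstruct);
+ *     }
+ *     dDestroy_A3d_gathered_on_2d(&SOLVEstruct, &grid3d);
+ *     dScalePermstructFree(&ScalePermstruct);
+ *     dLUstructFree(&LUstruct);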
+ * </pre>
+ */ +// dSOLVEstruct3d_t * SOLVEstruct, +// SOLVEstruct->A3d + +int writeLUtoDisk(int nsupers, int_t *xsup, dLUstruct_t *LUstruct) +{ + + if (getenv("LUFILE")) + { + FILE *fp = fopen(getenv("LUFILE"), "w"); + printf("writing to %s", getenv("LUFILE")); + for (int i = 0; i < nsupers; i++) + { + if (LUstruct->Llu->Lrowind_bc_ptr[i]) + { + int_t *lsub = LUstruct->Llu->Lrowind_bc_ptr[i]; + double *nzval = LUstruct->Llu->Lnzval_bc_ptr[i]; + + int_t len = lsub[1]; /* LDA of the nzval[] */ + int_t len2 = SuperSize(i) * len; + fwrite(nzval, sizeof(double), len2, fp); // assume fp will be incremented + } + + if (LUstruct->Llu->Ufstnz_br_ptr[i]) + { + int_t *usub = LUstruct->Llu->Ufstnz_br_ptr[i]; + double *nzval = LUstruct->Llu->Unzval_br_ptr[i]; + int_t lenv = usub[1]; + + fwrite(nzval, sizeof(double), lenv, fp); // assume fp will be incremented + } + } + + fclose(fp); + } + else + { + printf("Please set environment variable LUFILE to write\n..bye bye"); + exit(0); + } + + return 0; +} + +#define EPSILON 1e-3 + +static int checkArr(double *A, double *B, int n) +{ + for (int i = 0; i < n; i++) + { + assert(fabs(A[i] - B[i]) <= EPSILON * SUPERLU_MIN(fabs(A[i]), fabs(B[i]))); + } + + return 0; +} + +int checkLUFromDisk(int nsupers, int_t *xsup, dLUstruct_t *LUstruct) +{ + dLocalLU_t *Llu = LUstruct->Llu; + + double *Lval_buf = doubleMalloc_dist(Llu->bufmax[1]); // DOUBLE_ALLOC(Llu->bufmax[1]); + double *Uval_buf = doubleMalloc_dist(Llu->bufmax[3]); // DOUBLE_ALLOC(Llu->bufmax[3]); + + if (getenv("LUFILE")) + { + FILE *fp = fopen(getenv("LUFILE"), "r"); + printf("reading from %s", getenv("LUFILE")); + for (int i = 0; i < nsupers; i++) + { + if (LUstruct->Llu->Lrowind_bc_ptr[i]) + { + int_t *lsub = LUstruct->Llu->Lrowind_bc_ptr[i]; + double *nzval = LUstruct->Llu->Lnzval_bc_ptr[i]; + + int_t len = lsub[1]; /* LDA of the nzval[] */ + int_t len2 = SuperSize(i) * len; + fread(Lval_buf, sizeof(double), len2, fp); // assume fp will be incremented + checkArr(nzval, Lval_buf, len2); + } + + if (LUstruct->Llu->Ufstnz_br_ptr[i]) + { + int_t *usub = LUstruct->Llu->Ufstnz_br_ptr[i]; + double *nzval = LUstruct->Llu->Unzval_br_ptr[i]; + int_t lenv = usub[1]; + + fread(Uval_buf, sizeof(double), lenv, fp); // assume fp will be incremented + checkArr(nzval, Uval_buf, lenv); + } + } + printf("CHecking LU from %s is succesful ", getenv("LUFILE")); + fclose(fp); + } + else + { + printf("Please set environment variable LUFILE to read\n..bye bye"); + exit(0); + } + + return 0; +} + + +/*! 
\brief Dump the factored matrix L using matlab triple-let format + */ +void dDumpLblocks3D(int_t nsupers, gridinfo3d_t *grid3d, + Glu_persist_t *Glu_persist, dLocalLU_t *Llu) +{ + register int c, extra, gb, j, i, lb, nsupc, nsupr, len, nb, ncb; + int k, mycol, r, n, nmax; + int_t nnzL; + int_t *xsup = Glu_persist->xsup; + int_t *index; + double *nzval; + char filename[256]; + FILE *fp, *fopen(); + gridinfo_t *grid = &(grid3d->grid2d); + int iam = grid->iam; + int iam3d = grid3d->iam; + + // assert(grid->npcol*grid->nprow==1); + + // count nonzeros in the first pass + nnzL = 0; + n = 0; + ncb = nsupers / grid->npcol; + extra = nsupers % grid->npcol; + mycol = MYCOL( iam, grid ); + if ( mycol < extra ) ++ncb; + for (lb = 0; lb < ncb; ++lb) { + index = Llu->Lrowind_bc_ptr[lb]; + if ( index ) { /* Not an empty column */ + nzval = Llu->Lnzval_bc_ptr[lb]; + nb = index[0]; + nsupr = index[1]; + gb = lb * grid->npcol + mycol; + nsupc = SuperSize( gb ); + for (c = 0, k = BC_HEADER, r = 0; c < nb; ++c) { + len = index[k+1]; + + for (j = 0; j < nsupc; ++j) { + for (i=0; i=xsup[gb]+j+1){ + nnzL ++; + nmax = SUPERLU_MAX(n,index[k+LB_DESCRIPTOR+i]+1); + n = nmax; + } + + } + } + k += LB_DESCRIPTOR + len; + r += len; + } + } + } + MPI_Allreduce(MPI_IN_PLACE,&nnzL,1,mpi_int_t,MPI_SUM,grid->comm); + MPI_Allreduce(MPI_IN_PLACE,&n,1,mpi_int_t,MPI_MAX,grid->comm); + + snprintf(filename, sizeof(filename), "%s-%d", "L", iam3d); + printf("Dumping L factor to --> %s\n", filename); + if ( !(fp = fopen(filename, "w")) ) { + ABORT("File open failed"); + } + + if(grid->iam==0){ + fprintf(fp, "%d %d " IFMT "\n", n,n,nnzL); + } + + ncb = nsupers / grid->npcol; + extra = nsupers % grid->npcol; + mycol = MYCOL( iam, grid ); + if ( mycol < extra ) ++ncb; + for (lb = 0; lb < ncb; ++lb) { + index = Llu->Lrowind_bc_ptr[lb]; + if ( index ) { /* Not an empty column */ + nzval = Llu->Lnzval_bc_ptr[lb]; + nb = index[0]; + nsupr = index[1]; + gb = lb * grid->npcol + mycol; + nsupc = SuperSize( gb ); + for (c = 0, k = BC_HEADER, r = 0; c < nb; ++c) { + len = index[k+1]; + + for (j = 0; j < nsupc; ++j) { + for (i=0; iStore; + SuperMatrix GA; /* Global A in NC format */ + NCformat *GAstore; + double *a_GA; + SuperMatrix GAC; /* Global A in NCP format (add n end pointers) */ + NCPformat *GACstore; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + Glu_freeable_t *Glu_freeable = NULL; + /* The nonzero structures of L and U factors, which are + replicated on all processrs. + (lsub, xlsub) contains the compressed subscript of + supernodes in L. + (usub, xusub) contains the compressed subscript of + nonzero segments in U. + If options->Fact != SamePattern_SameRowPerm, they are + computed by SYMBFACT routine, and then used by PDDISTRIBUTE + routine. They will be freed after PDDISTRIBUTE routine. + If options->Fact == SamePattern_SameRowPerm, these + structures are not used. */ + yes_no_t parSymbFact = options->ParSymbFact; + fact_t Fact; + double *a; + int_t *colptr, *rowind; + int_t *perm_r; /* row permutations from partial pivoting */ + int_t *perm_c; /* column permutation vector */ + int_t *etree; /* elimination tree */ + int_t *rowptr, *colind; /* Local A in NR */ + int_t colequ, Equil, factored, job, notran, rowequ, need_value; + int_t i, iinfo, j, irow, m, n, nnz, permc_spec; + int_t nnz_loc, m_loc, fst_row, icol; + int iam; + int ldx; /* LDA for matrix X (local). 
*/ + char equed[1], norm[1]; + double *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd; + double *X, *b_col, *b_work, *x_col; + double t; + float GA_mem_use; /* memory usage by global A */ + float dist_mem_use; /* memory usage during distribution */ + superlu_dist_mem_usage_t num_mem_usage, symb_mem_usage; + float flinfo; /* track memory usage of parallel symbolic factorization */ + bool Solve3D = true; + int_t nsupers; +#if (PRNTlevel >= 2) + double dmin, dsum, dprod; +#endif + + dtrf3Dpartition_t *trf3Dpartition=LUstruct->trf3Dpart; + int gpu3dVersion = 0; + #ifdef GPU_ACC + // gpu3dVersion = 1; + if (getenv("GPU3DVERSION")) + { + gpu3dVersion = atoi(getenv("GPU3DVERSION")); + } + + LUgpu_Handle LUgpu; + #endif + + + LUstruct->dt = 'd'; + + // get the 2d grid + gridinfo_t *grid = &(grid3d->grid2d); + iam = grid->iam; + + /* Test the options choices. */ + *info = 0; + Fact = options->Fact; + validateInput_ssvx3d(options, A, ldb, nrhs, grid3d, info); + + /* Initialization. */ + + options->Algo3d = YES; + + /* definition of factored seen by each process layer */ + factored = (Fact == FACTORED); + + /* Save the inputs: ldb -> ldb3d, and B -> B3d, Astore -> Astore3d, + so that the names {ldb, B, and Astore} can be used internally. + B3d and Astore3d will be assigned back to B and Astore on return.*/ + int ldb3d = ldb; + NRformat_loc *Astore3d = (NRformat_loc *)A->Store; + NRformat_loc3d *A3d = SOLVEstruct->A3d; + + /* B3d is aliased to B; + B2d is allocated; + B is then aliased to B2d for the following 2D solve; + */ + dGatherNRformat_loc3d_allgrid(Fact, (NRformat_loc *)A->Store, + B, ldb, nrhs, grid3d, &A3d); + + B = (double *)A3d->B2d; /* B is now pointing to B2d, + allocated in dGatherNRformat_loc3d. */ + // PrintDouble5("after gather B=B2d", ldb, B); + + SOLVEstruct->A3d = A3d; /* This structure need to be persistent across + multiple calls of pdgssvx3d() */ + + NRformat_loc *Astore0 = A3d->A_nfmt; // on all grids + NRformat_loc *A_orig = A->Store; + ////// + +#if (DEBUGlevel >= 1) + CHECK_MALLOC(iam, "Enter pdgssvx3d()"); +#endif + + /* Perform preprocessing steps on process layer zero, including: + gather 3D matrices {A, B} onto 2D grid-0, preprocessing steps: + - equilibration, + - ordering, + - symbolic factorization, + - distribution of L & U */ + + m = A->nrow; + n = A->ncol; + // checkNRFMT(Astore0, (NRformat_loc *) A->Store); + + // On input, A->Store is on 3D, now A->Store is re-assigned to 2D store + A->Store = Astore0; // on all grids + ldb = Astore0->m_loc; + + /* The following code now works on all grids */ + Astore = (NRformat_loc *)A->Store; + nnz_loc = Astore->nnz_loc; + m_loc = Astore->m_loc; + fst_row = Astore->fst_row; + a = (double *)Astore->nzval; + rowptr = Astore->rowptr; + colind = Astore->colind; + + /* Structures needed for parallel symbolic factorization */ + int_t *sizes, *fstVtxSep; + int noDomains, nprocs_num; + MPI_Comm symb_comm; /* communicator for symbolic factorization */ + int col, key; /* parameters for creating a new communicator */ + Pslu_freeable_t Pslu_freeable; + + sizes = NULL; + fstVtxSep = NULL; + symb_comm = MPI_COMM_NULL; + + Equil = (!factored && options->Equil == YES); + notran = (options->Trans == NOTRANS); + + iam = grid->iam; + job = 5; + /* Extract equilibration status from a previous factorization */ + if (factored || (Fact == SamePattern_SameRowPerm && Equil)) + { + rowequ = (ScalePermstruct->DiagScale == ROW) || + (ScalePermstruct->DiagScale == BOTH); + colequ = (ScalePermstruct->DiagScale == COL) || + (ScalePermstruct->DiagScale == 
BOTH); + } + else + { + rowequ = colequ = FALSE; + } + + /* Not factored & ask for equilibration, then alloc RC */ + if (Equil && Fact != SamePattern_SameRowPerm) + dallocScalePermstruct_RC(ScalePermstruct, m, n); + + /* The following arrays are replicated on all processes. */ + perm_r = ScalePermstruct->perm_r; + perm_c = ScalePermstruct->perm_c; + etree = LUstruct->etree; + R = ScalePermstruct->R; + C = ScalePermstruct->C; + + /* ------------------------------------------------------------ + Diagonal scaling to equilibrate the matrix. + ------------------------------------------------------------ */ + if (Equil) + { + scaleMatrixDiagonally(Fact, ScalePermstruct, + A, stat, grid, &rowequ, &colequ, &iinfo); + if (iinfo < 0) + return; // return if error + + } /* end if Equil ... LAPACK style, not involving MC64 */ + + if (!factored) + { /* Skip this if already factored. */ + /* + * Gather A from the distributed compressed row format to + * global A in compressed column format. + * Numerical values are gathered only when a row permutation + * for large diagonal is sought after. + */ + if (Fact != SamePattern_SameRowPerm && + (parSymbFact == NO || options->RowPerm != NO)) + { + + need_value = (options->RowPerm == LargeDiag_MC64); + + pdCompRow_loc_to_CompCol_global(need_value, A, grid, &GA); + + GAstore = (NCformat *)GA.Store; + colptr = GAstore->colptr; + rowind = GAstore->rowind; + nnz = GAstore->nnz; + GA_mem_use = (nnz + n + 1) * sizeof(int_t); + + if (need_value) + { + a_GA = (double *)GAstore->nzval; + GA_mem_use += nnz * sizeof(double); + } + + else + assert(GAstore->nzval == NULL); + } + + /* ------------------------------------------------------------ + Find the row permutation for A. + ------------------------------------------------------------ */ + perform_row_permutation( + options, Fact, ScalePermstruct, LUstruct, + m, n, grid, A, &GA, stat, job, Equil, + &rowequ, &colequ, &iinfo); + + } /* end if (!factored) */ + + /* Compute norm(A), which will be used to adjust small diagonal. 
*/ + if (!factored || options->IterRefine) + anorm = computeA_Norm(notran, A, grid); + + /* ------------------------------------------------------------ + Perform ordering and symbolic factorization + ------------------------------------------------------------ */ + if (!factored) + { + t = SuperLU_timer_(); + /* + * Get column permutation vector perm_c[], according to permc_spec: + * permc_spec = NATURAL: natural ordering + * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A + * permc_spec = MMD_ATA: minimum degree on structure of A'*A + * permc_spec = METIS_AT_PLUS_A: METIS on structure of A'+A + * permc_spec = PARMETIS: parallel METIS on structure of A'+A + * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] + */ + permc_spec = options->ColPerm; + + if (parSymbFact == YES || permc_spec == PARMETIS) + { + nprocs_num = grid->nprow * grid->npcol; + noDomains = (int)(pow(2, ((int)LOG2(nprocs_num)))); + + /* create a new communicator for the first noDomains + processes in grid->comm */ + key = iam; + if (iam < noDomains) + col = 0; + else + col = MPI_UNDEFINED; + MPI_Comm_split(grid->comm, col, key, &symb_comm); + + if (permc_spec == NATURAL || permc_spec == MY_PERMC) + { + if (permc_spec == NATURAL) + { + for (j = 0; j < n; ++j) + perm_c[j] = j; + } + if (!(sizes = intMalloc_dist(2 * noDomains))) + ABORT("SUPERLU_MALLOC fails for sizes."); + if (!(fstVtxSep = intMalloc_dist(2 * noDomains))) + ABORT("SUPERLU_MALLOC fails for fstVtxSep."); + for (i = 0; i < 2 * noDomains - 2; ++i) + { + sizes[i] = 0; + fstVtxSep[i] = 0; + } + sizes[2 * noDomains - 2] = m; + fstVtxSep[2 * noDomains - 2] = 0; + } + else if (permc_spec != PARMETIS) + { + /* same as before */ + printf("{%4d,%4d}: pdgssvx3d: invalid ColPerm option when ParSymbfact is used\n", + (int)MYROW(grid->iam, grid), (int)MYCOL(grid->iam, grid)); + } + } /* end ... use parmetis */ + + + if (permc_spec != MY_PERMC && Fact == DOFACT) + { + if (permc_spec == PARMETIS) + { + /* Get column permutation vector in perm_c. * + * This routine takes as input the distributed input matrix A * + * and does not modify it. It also allocates memory for * + * sizes[] and fstVtxSep[] arrays, that contain information * + * on the separator tree computed by ParMETIS. */ + flinfo = get_perm_c_parmetis(A, perm_r, perm_c, nprocs_num, + noDomains, &sizes, &fstVtxSep, + grid, &symb_comm); + if (flinfo > 0) + ABORT("ERROR in get perm_c parmetis."); + } + else + { + get_perm_c_dist(iam, permc_spec, &GA, perm_c); + } + } + + stat->utime[COLPERM] = SuperLU_timer_() - t; + + /* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc' + (a.k.a. column etree), depending on the choice of ColPerm. + Adjust perm_c[] to be consistent with a postorder of etree. + Permute columns of A to form A*Pc'. */ + if (Fact != SamePattern_SameRowPerm) + { + if (parSymbFact == NO) + { + + int_t *GACcolbeg, *GACcolend, *GACrowind; + + sp_colorder(options, &GA, perm_c, etree, &GAC); + + /* Form Pc*A*Pc' to preserve the diagonal of the matrix GAC. */ + GACstore = (NCPformat *)GAC.Store; + GACcolbeg = GACstore->colbeg; + GACcolend = GACstore->colend; + GACrowind = GACstore->rowind; + for (j = 0; j < n; ++j) + { + for (i = GACcolbeg[j]; i < GACcolend[j]; ++i) + { + irow = GACrowind[i]; + GACrowind[i] = perm_c[irow]; + } + } + + /* Perform a symbolic factorization on Pc*Pr*A*Pc' and set up + the nonzero data structures for L & U. */ +#if (PRNTlevel >= 1) + if (!iam) + printf(".. 
symbfact(): relax %4d, maxsuper %4d, fill %4d\n", + sp_ienv_dist(2, options), sp_ienv_dist(3, options), sp_ienv_dist(6, options)); +#endif + t = SuperLU_timer_(); + if (!(Glu_freeable = (Glu_freeable_t *) + SUPERLU_MALLOC(sizeof(Glu_freeable_t)))) + ABORT("Malloc fails for Glu_freeable."); + + /* Every process does this. */ + iinfo = symbfact(options, iam, &GAC, perm_c, etree, + Glu_persist, Glu_freeable); + + stat->utime[SYMBFAC] = SuperLU_timer_() - t; + if (iinfo < 0) + { + /* Successful return */ + QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage); + +#if (PRNTlevel >= 1) + if (!iam) + { + printf("\tNo of supers %ld\n", + (long)Glu_persist->supno[n - 1] + 1); + printf("\tSize of G(L) %ld\n", (long)Glu_freeable->xlsub[n]); + printf("\tSize of G(U) %ld\n", (long)Glu_freeable->xusub[n]); + printf("\tint %lu, short %lu, float %lu, double %lu\n", + sizeof(int_t), sizeof(short), + sizeof(float), sizeof(double)); + printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", + symb_mem_usage.for_lu * 1e-6, + symb_mem_usage.total * 1e-6, + symb_mem_usage.expansions); + } +#endif + } + else + { + if (!iam) + { + fprintf(stderr, "symbfact() error returns %d\n", + (int)iinfo); + exit(-1); + } + } + + } /* end serial symbolic factorization */ + else + { /* parallel symbolic factorization */ + t = SuperLU_timer_(); + flinfo = + symbfact_dist(options, nprocs_num, noDomains, + A, perm_c, perm_r, + sizes, fstVtxSep, &Pslu_freeable, + &(grid->comm), &symb_comm, + &symb_mem_usage); + stat->utime[SYMBFAC] = SuperLU_timer_() - t; + if (flinfo > 0) + ABORT("Insufficient memory for parallel symbolic factorization."); + } + + /* Destroy GA */ + if (parSymbFact == NO || options->RowPerm != NO) + Destroy_CompCol_Matrix_dist(&GA); + if (parSymbFact == NO) + Destroy_CompCol_Permuted_dist(&GAC); + + } /* end if Fact not SamePattern_SameRowPerm */ + +#if (DEBUGlevel >= 2) // Sherry + if (!iam) + PrintInt10("perm_c", m, perm_c); +#endif + if (sizes) + SUPERLU_FREE(sizes); + if (fstVtxSep) + SUPERLU_FREE(fstVtxSep); + if (symb_comm != MPI_COMM_NULL) + MPI_Comm_free(&symb_comm); + + if (parSymbFact == NO || Fact == SamePattern_SameRowPerm) + { + /* Apply column permutation to the original distributed A */ + for (j = 0; j < nnz_loc; ++j) + colind[j] = perm_c[colind[j]]; + + /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. + NOTE: the row permutation Pc*Pr is applied internally in the + distribution routine. 
*/ + t = SuperLU_timer_(); + + nsupers = getNsupers(n, LUstruct->Glu_persist); + int* supernodeMask; + if(Fact == SamePattern_SameRowPerm){ + supernodeMask=trf3Dpartition->supernodeMask; + dist_mem_use = pddistribute_allgrid(options, n, A, ScalePermstruct, + Glu_freeable, LUstruct, grid, supernodeMask); + + }else{ + + // First call of pddistribute_allgrid with a prefixed supernodeMask + // YL: this first call can be removed with Piyush's cleaner fix + int* supernodeMask = int32Calloc_dist(nsupers); + for (int i=0;izscp.Iam == i%grid3d->npdep) + supernodeMask[i]=1; + } + dist_mem_use = pddistribute_allgrid_index_only(options, n, A, ScalePermstruct, + Glu_freeable, LUstruct, grid, supernodeMask); + SUPERLU_FREE(supernodeMask); + + // Generate the 3D partition + dDestroy_trf3Dpartition(LUstruct->trf3Dpart); + trf3Dpartition = dinitTrf3Dpartition_allgrid(n, options, LUstruct, grid3d); + LUstruct->trf3Dpart=trf3Dpartition; + + // Delete the meta data generated by pddistribute_allgrid + dLocalLU_t *Llu = LUstruct->Llu; + for (int jb = 0; jb < CEILING( nsupers, grid->npcol ); ++jb) { /* for each block column ... */ + if ( Llu->Lrowind_bc_ptr[jb] ) { + SUPERLU_FREE (Llu->Lrowind_bc_ptr[jb]); + } + } + SUPERLU_FREE (Llu->Lrowind_bc_ptr); + for (int lb = 0; lb < CEILING( nsupers, grid->nprow ); ++lb) { /* for each block row ... */ + if(Llu->Ufstnz_br_ptr[lb]!=NULL) + SUPERLU_FREE(Llu->Ufstnz_br_ptr[lb]); + } + SUPERLU_FREE(Llu->Ufstnz_br_ptr); + + + // Second call of pddistribute_allgrid with the final supernodeMask + dist_mem_use = pddistribute_allgrid(options, n, A, ScalePermstruct, + Glu_freeable, LUstruct, grid, trf3Dpartition->supernodeMask); + + + /* now that LU structure has been scattered, initialize the LU and buffers */ + dinit3DLUstructForest(trf3Dpartition->myTreeIdxs, trf3Dpartition->myZeroTrIdxs, + trf3Dpartition->sForests, LUstruct, grid3d); + dLUValSubBuf_t *LUvsb = SUPERLU_MALLOC(sizeof(dLUValSubBuf_t)); + dLluBufInit(LUvsb, LUstruct); + trf3Dpartition->LUvsb = LUvsb; + } + + + stat->utime[DIST] = SuperLU_timer_() - t; + + /* Deallocate storage used in symbolic factorization. */ + if (Fact != SamePattern_SameRowPerm) + { + iinfo = symbfact_SubFree(Glu_freeable); + SUPERLU_FREE(Glu_freeable); + } + + } + else + { + /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. + NOTE: the row permutation Pc*Pr is applied internally in the + distribution routine. */ + /* Apply column permutation to the original distributed A */ + for (j = 0; j < nnz_loc; ++j) + colind[j] = perm_c[colind[j]]; + + t = SuperLU_timer_(); + dist_mem_use = ddist_psymbtonum(options, n, A, ScalePermstruct, + &Pslu_freeable, LUstruct, grid); + if (dist_mem_use > 0) + ABORT("Not enough memory available for dist_psymbtonum\n"); + + stat->utime[DIST] = SuperLU_timer_() - t; + + ABORT("ddist_psymbtonum does not yet work with 3D factorization\n"); + + } + + /*if (!iam) printf ("\tDISTRIBUTE time %8.2f\n", stat->utime[DIST]); */ + + + /* Perform numerical factorization in parallel on all process layers.*/ + + /* nvshmem related. 
The nvshmem_malloc has to be called before trs_compute_communication_structure, otherwise solve is much slower*/ + #ifdef HAVE_NVSHMEM + int nc = CEILING( nsupers, grid->npcol); + int nr = CEILING( nsupers, grid->nprow); + int flag_bc_size = RDMA_FLAG_SIZE * (nc+1); + int flag_rd_size = RDMA_FLAG_SIZE * nr * 2; + int my_flag_bc_size = RDMA_FLAG_SIZE * (nc+1); + int my_flag_rd_size = RDMA_FLAG_SIZE * nr * 2; + int maxrecvsz = sp_ienv_dist(3, options)* nrhs + SUPERLU_MAX( XK_H, LSUM_H ); + int ready_x_size = maxrecvsz*nc; + int ready_lsum_size = 2*maxrecvsz*nr; + if (getenv("SUPERLU_ACC_SOLVE")){ + nv_init_wrapper(grid->comm); + prepare_multiGPU_buffers(flag_bc_size,flag_rd_size,ready_x_size,ready_lsum_size,my_flag_bc_size,my_flag_rd_size); + } + #endif + + + + + + SCT_t *SCT = (SCT_t *)SUPERLU_MALLOC(sizeof(SCT_t)); + SCT_init(SCT); + +#if (PRNTlevel >= 1) + if (grid3d->iam == 0) + { + printf("after 3D initialization.\n"); + fflush(stdout); + } +#endif + + + + + + t = SuperLU_timer_(); + + /*factorize in grid 1*/ + // if(grid3d->zscp.Iam) + // get environment variable TRF3DVERSION +#ifdef GPU_ACC + if (gpu3dVersion == 1) + { /* this is the new C++ code in TRF3dV100/ directory */ + + if (!grid3d->iam) + printf("Using pdgstrf3d+gpu version 1 for Summit\n"); +#if 0 + pdgstrf3d_summit(options, m, n, anorm, trf3Dpartition, SCT, LUstruct, + grid3d, stat, info); +#else + int_t ldt = sp_ienv_dist(3, options); /* Size of maximum supernode */ + double s_eps = smach_dist("Epsilon"); + double thresh = s_eps * anorm; + + /* call constructor in C++ code */ + LUgpu = createLUgpuHandle(nsupers, ldt, trf3Dpartition, LUstruct, grid3d, + SCT, options, stat, thresh, info); + + /* call pdgstrf3d() in C++ code */ + pdgstrf3d_LUpackedInterface(LUgpu); + + copyLUGPU2Host(LUgpu, LUstruct); + destroyLUgpuHandle(LUgpu); + + // print other stuff + // if (!grid3d->zscp.Iam) + // SCT_printSummary(grid, SCT); + reduceStat(FACT, stat, grid3d); + +#endif + } + else /* this is the old C code, with less GPU offload */ +#endif /* matching ifdef GPU_ACC */ + { + + pdgstrf3d(options, m, n, anorm, trf3Dpartition, SCT, LUstruct, + grid3d, stat, info); + + // dDumpLblocks3D(nsupers, grid3d, LUstruct->Glu_persist, LUstruct->Llu); + + + } + if (getenv("NEW3DSOLVE")){ + dbroadcastAncestor3d(trf3Dpartition, LUstruct, grid3d, SCT); + } + + if ( options->Fact != SamePattern_SameRowPerm) { + if (getenv("NEW3DSOLVE") && Solve3D==true){ + trs_compute_communication_structure(options, n, LUstruct, + ScalePermstruct, trf3Dpartition->supernodeMask, grid, stat); + }else{ + int* supernodeMask = int32Malloc_dist(nsupers); + for(int ii=0; iiutime[FACT] = SuperLU_timer_() - t; + + /*factorize in grid 1*/ + // if(grid3d->zscp.Iam) + double tgather = SuperLU_timer_(); + if(Solve3D==false){ + dgatherAllFactoredLU(trf3Dpartition, LUstruct, grid3d, SCT); + } + SCT->gatherLUtimer += SuperLU_timer_() - tgather; + /*print stats for bottom grid*/ + + // Write LU to file + int writeLU = 0; + if (getenv("WRITELU")) + { + writeLU = atoi(getenv("WRITELU")); + } + + if (writeLU) + { + if (!grid3d->zscp.Iam) + writeLUtoDisk(nsupers, Glu_persist->xsup, LUstruct); + } + + int checkLU = 0; + if (getenv("CHECKLU")) + { + checkLU = atoi(getenv("CHECKLU")); + } + + if (checkLU) + { + if (!grid3d->zscp.Iam) + checkLUFromDisk(nsupers, Glu_persist->xsup, LUstruct); + } + +#if (PRNTlevel >= 0) + if (!grid3d->zscp.Iam) + { + SCT_print(grid, SCT); + SCT_print3D(grid3d, SCT); + } + SCT_printComm3D(grid3d, SCT); + + /*print memory usage*/ + d3D_printMemUse(trf3Dpartition, 
LUstruct, grid3d); + + SCT->gatherLUtimer += SuperLU_timer_() - tgather; + /*print stats for bottom grid*/ + /*print forest weight and costs*/ + printForestWeightCost(trf3Dpartition->sForests, SCT, grid3d); + /*reduces stat from all the layers*/ +#endif + + SCT_free(SCT); + + } /* end if not Factored ... factor on all process layers */ + + if (grid3d->zscp.Iam == 0 ) + { // only process layer 0 + if (!factored) + { + if (options->PrintStat) + { + int_t TinyPivots; + float for_lu, total, avg, loc_max; + float mem_stage[3]; + struct { float val; int rank; } local_struct, global_struct; + + MPI_Reduce( &stat->TinyPivots, &TinyPivots, 1, mpi_int_t, + MPI_SUM, 0, grid->comm ); + stat->TinyPivots = TinyPivots; + + /*-- Compute high watermark of all stages --*/ + if (parSymbFact == TRUE) + { + /* The memory used in the redistribution routine + includes the memory used for storing the symbolic + structure and the memory allocated for numerical + factorization */ + mem_stage[0] = (-flinfo); /* symbfact step */ + mem_stage[1] = (-dist_mem_use); /* distribution step */ + loc_max = SUPERLU_MAX( mem_stage[0], mem_stage[1]); + if (options->RowPerm != NO ) + loc_max = SUPERLU_MAX(loc_max, GA_mem_use); + } + else + { + mem_stage[0] = symb_mem_usage.total + GA_mem_use; /* symbfact step */ + mem_stage[1] = symb_mem_usage.for_lu + dist_mem_use + num_mem_usage.for_lu; /* distribution step */ + loc_max = SUPERLU_MAX(mem_stage[0], mem_stage[1] ); + } + + dQuerySpace_dist(n, LUstruct, grid, stat, &num_mem_usage); + mem_stage[2] = num_mem_usage.total; /* numerical factorization step */ + + loc_max = SUPERLU_MAX(loc_max, mem_stage[2] ); /* local max of 3 stages */ + + local_struct.val = loc_max; + local_struct.rank = grid->iam; + MPI_Reduce( &local_struct, &global_struct, 1, MPI_FLOAT_INT, MPI_MAXLOC, 0, grid->comm ); + int all_highmark_rank = global_struct.rank; + float all_highmark_mem = global_struct.val * 1e-6; + + MPI_Reduce( &loc_max, &avg, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); + MPI_Reduce( &num_mem_usage.for_lu, &for_lu, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); + MPI_Reduce( &num_mem_usage.total, &total, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); + + /*-- Compute memory usage of numerical factorization --*/ + local_struct.val = num_mem_usage.for_lu; + MPI_Reduce(&local_struct, &global_struct, 1, MPI_FLOAT_INT, MPI_MAXLOC, 0, grid->comm); + int lu_max_rank = global_struct.rank; + float lu_max_mem = global_struct.val * 1e-6; + + local_struct.val = stat->peak_buffer; + MPI_Reduce( &local_struct, &global_struct, 1, MPI_FLOAT_INT, MPI_MAXLOC, 0, grid->comm ); + int buffer_peak_rank = global_struct.rank; + float buffer_peak = global_struct.val*1e-6; + if (iam == 0) + { + printf("\n** Memory Usage **********************************\n"); + printf("** Total highmark (MB):\n" + " Sum-of-all : %8.2f | Avg : %8.2f | Max : %8.2f\n", + avg * 1e-6, + avg / grid->nprow / grid->npcol * 1e-6, + all_highmark_mem); + printf(" Max at rank %d, different stages (MB):\n" + "\t. symbfact %8.2f\n" + "\t. distribution %8.2f\n" + "\t. numfact %8.2f\n", + all_highmark_rank, mem_stage[0] * 1e-6, mem_stage[1] * 1e-6, mem_stage[2] * 1e-6); + printf("** NUMfact space (MB): (sum-of-all-processes)\n" + " L\\U : %8.2f | Total : %8.2f\n", + for_lu * 1e-6, total * 1e-6); + printf("\t. max at rank %d, max L+U memory (MB): %8.2f\n" + "\t. 
max at rank %d, peak buffer (MB): %8.2f\n", + lu_max_rank, lu_max_mem, + buffer_peak_rank, buffer_peak); + printf("**************************************************\n\n"); + printf("** number of Tiny Pivots: %8d\n\n", stat->TinyPivots); + fflush(stdout); + } + } /* end printing stats */ + + } /* end if not Factored */ + } + + if(Solve3D){ + + if ( options->Fact == DOFACT || options->Fact == SamePattern ) { + /* Need to reset the solve's communication pattern, + because perm_r[] and/or perm_c[] is changed. */ + if ( options->SolveInitialized == YES ) { /* Initialized before */ + dSolveFinalize(options, SOLVEstruct); /* Clean up structure */ + pdgstrs_delete_device_lsum_x(SOLVEstruct); + options->SolveInitialized = NO; /* Reset the solve state */ + } + } + + if (getenv("NEW3DSOLVE")){ + + + if (options->DiagInv == YES && (Fact != FACTORED)) + { + pdCompute_Diag_Inv(n, LUstruct, grid, stat, info); + + // The following #ifdef GPU_ACC block frees and reallocates GPU data for trisolve. The data seems to be overwritten by pdgstrf3d. + int_t nsupers = getNsupers(n, LUstruct->Glu_persist); +#if (defined(GPU_ACC) && defined(GPU_SOLVE)) + + pdconvertU(options, grid, LUstruct, stat, n); + + // checkGPU(gpuFree(LUstruct->Llu->d_xsup)); + // checkGPU(gpuFree(LUstruct->Llu->d_bcols_masked)); + // checkGPU(gpuFree(LUstruct->Llu->d_LRtree_ptr)); + // checkGPU(gpuFree(LUstruct->Llu->d_LBtree_ptr)); + // checkGPU(gpuFree(LUstruct->Llu->d_URtree_ptr)); + // checkGPU(gpuFree(LUstruct->Llu->d_UBtree_ptr)); + // checkGPU(gpuFree(LUstruct->Llu->d_Lrowind_bc_dat)); + // checkGPU(gpuFree(LUstruct->Llu->d_Lindval_loc_bc_dat)); + // checkGPU(gpuFree(LUstruct->Llu->d_Lrowind_bc_offset)); + // checkGPU(gpuFree(LUstruct->Llu->d_Lindval_loc_bc_offset)); + // checkGPU(gpuFree(LUstruct->Llu->d_Lnzval_bc_offset)); + // checkGPU(gpuFree(LUstruct->Llu->d_Linv_bc_offset)); + // checkGPU(gpuFree(LUstruct->Llu->d_Uinv_bc_offset)); + // checkGPU(gpuFree(LUstruct->Llu->d_ilsum)); + // checkGPU(gpuFree(LUstruct->Llu->d_grid)); + // checkGPU(gpuFree(LUstruct->Llu->d_Lnzval_bc_dat)); + // checkGPU(gpuFree(LUstruct->Llu->d_Linv_bc_dat)); + // checkGPU(gpuFree(LUstruct->Llu->d_Uinv_bc_dat)); + + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_xsup, (n + 1) * sizeof(int_t))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_xsup, LUstruct->Glu_persist->xsup, (n + 1) * sizeof(int_t), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc( (void**)&LUstruct->Llu->d_bcols_masked, LUstruct->Llu->nbcol_masked * sizeof(int))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_bcols_masked, LUstruct->Llu->bcols_masked, LUstruct->Llu->nbcol_masked * sizeof(int), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_LRtree_ptr, CEILING(nsupers, grid->nprow) * sizeof(C_Tree))); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_LBtree_ptr, CEILING(nsupers, grid->npcol) * sizeof(C_Tree))); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_URtree_ptr, CEILING(nsupers, grid->nprow) * sizeof(C_Tree))); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_UBtree_ptr, CEILING(nsupers, grid->npcol) * sizeof(C_Tree))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_LRtree_ptr, LUstruct->Llu->LRtree_ptr, CEILING(nsupers, grid->nprow) * sizeof(C_Tree), gpuMemcpyHostToDevice)); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_LBtree_ptr, LUstruct->Llu->LBtree_ptr, CEILING(nsupers, grid->npcol) * sizeof(C_Tree), gpuMemcpyHostToDevice)); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_URtree_ptr, LUstruct->Llu->URtree_ptr, CEILING(nsupers, grid->nprow) * sizeof(C_Tree), 
gpuMemcpyHostToDevice)); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_UBtree_ptr, LUstruct->Llu->UBtree_ptr, CEILING(nsupers, grid->npcol) * sizeof(C_Tree), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lrowind_bc_dat, (LUstruct->Llu->Lrowind_bc_cnt) * sizeof(int_t))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_Lrowind_bc_dat, LUstruct->Llu->Lrowind_bc_dat, (LUstruct->Llu->Lrowind_bc_cnt) * sizeof(int_t), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lindval_loc_bc_dat, (LUstruct->Llu->Lindval_loc_bc_cnt) * sizeof(int_t))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_Lindval_loc_bc_dat, LUstruct->Llu->Lindval_loc_bc_dat, (LUstruct->Llu->Lindval_loc_bc_cnt) * sizeof(int_t), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lrowind_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_Lrowind_bc_offset, LUstruct->Llu->Lrowind_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lindval_loc_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_Lindval_loc_bc_offset, LUstruct->Llu->Lindval_loc_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lnzval_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_Lnzval_bc_offset, LUstruct->Llu->Lnzval_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Linv_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_Linv_bc_offset, LUstruct->Llu->Linv_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Uinv_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_Uinv_bc_offset, LUstruct->Llu->Uinv_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_ilsum, (CEILING(nsupers, grid->nprow) + 1) * sizeof(int_t))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_ilsum, LUstruct->Llu->ilsum, (CEILING(nsupers, grid->nprow) + 1) * sizeof(int_t), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lnzval_bc_dat, (LUstruct->Llu->Lnzval_bc_cnt) * sizeof(double))); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Linv_bc_dat, (LUstruct->Llu->Linv_bc_cnt) * sizeof(double))); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Uinv_bc_dat, (LUstruct->Llu->Uinv_bc_cnt) * sizeof(double))); + // checkGPU(gpuMalloc( (void**)&LUstruct->Llu->d_grid, sizeof(gridinfo_t))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_grid, grid, sizeof(gridinfo_t), gpuMemcpyHostToDevice)); +#endif +if (getenv("SUPERLU_ACC_SOLVE")){ +#ifdef GPU_ACC + checkGPU(gpuMemcpy(LUstruct->Llu->d_Linv_bc_dat, LUstruct->Llu->Linv_bc_dat, + (LUstruct->Llu->Linv_bc_cnt) * sizeof(double), gpuMemcpyHostToDevice)); + checkGPU(gpuMemcpy(LUstruct->Llu->d_Uinv_bc_dat, LUstruct->Llu->Uinv_bc_dat, + (LUstruct->Llu->Uinv_bc_cnt) * sizeof(double), gpuMemcpyHostToDevice)); + checkGPU(gpuMemcpy(LUstruct->Llu->d_Lnzval_bc_dat, LUstruct->Llu->Lnzval_bc_dat, + (LUstruct->Llu->Lnzval_bc_cnt) * sizeof(double), gpuMemcpyHostToDevice)); +#endif +} + } + } + }else{ /* if(Solve3D) */ 
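+
+      /* In this branch only layer grid-0 of the 3D grid performs the solve,
+         and a GPU triangular solve additionally requires options->DiagInv
+         == YES, as the warning below notes. A minimal caller-side sketch,
+         assuming the usual set_default_options_dist initializer:
+
+             superlu_dist_options_t options;
+             set_default_options_dist(&options);
+             options.DiagInv = YES;  // precompute inverses of diagonal blocks
+
+         DiagInv must be set before the factorization so that
+         pdCompute_Diag_Inv below is invoked after factorization. */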
+ + if (grid3d->zscp.Iam == 0){ /* on 2D grid-0 */ + + if ( options->Fact == DOFACT || options->Fact == SamePattern ) { + /* Need to reset the solve's communication pattern, + because perm_r[] and/or perm_c[] is changed. */ + if ( options->SolveInitialized == YES ) { /* Initialized before */ + dSolveFinalize(options, SOLVEstruct); /* Clean up structure */ + pdgstrs_delete_device_lsum_x(SOLVEstruct); + options->SolveInitialized = NO; /* Reset the solve state */ + } + } + +#if (defined(GPU_ACC) && defined(GPU_SOLVE)) + if (options->DiagInv == NO) + { + if (iam == 0) + { + printf("!!WARNING: GPU trisolve requires setting options->DiagInv==YES\n"); + printf(" otherwise, use CPU trisolve\n"); + fflush(stdout); + } + // exit(0); // Sherry: need to return an error flag + } +#endif + + if (options->DiagInv == YES && (Fact != FACTORED)) + { + pdCompute_Diag_Inv(n, LUstruct, grid, stat, info); + + // The following #ifdef GPU_ACC block frees and reallocates GPU data for trisolve. The data seems to be overwritten by pdgstrf3d. + int_t nsupers = getNsupers(n, LUstruct->Glu_persist); +#ifdef GPU_ACC + + pdconvertU(options, grid, LUstruct, stat, n); + + // checkGPU(gpuFree(LUstruct->Llu->d_xsup)); + // checkGPU(gpuFree(LUstruct->Llu->d_bcols_masked)); + // checkGPU(gpuFree(LUstruct->Llu->d_LRtree_ptr)); + // checkGPU(gpuFree(LUstruct->Llu->d_LBtree_ptr)); + // checkGPU(gpuFree(LUstruct->Llu->d_URtree_ptr)); + // checkGPU(gpuFree(LUstruct->Llu->d_UBtree_ptr)); + // checkGPU(gpuFree(LUstruct->Llu->d_Lrowind_bc_dat)); + // checkGPU(gpuFree(LUstruct->Llu->d_Lindval_loc_bc_dat)); + // checkGPU(gpuFree(LUstruct->Llu->d_Lrowind_bc_offset)); + // checkGPU(gpuFree(LUstruct->Llu->d_Lindval_loc_bc_offset)); + // checkGPU(gpuFree(LUstruct->Llu->d_Lnzval_bc_offset)); + // checkGPU(gpuFree(LUstruct->Llu->d_Linv_bc_offset)); + // checkGPU(gpuFree(LUstruct->Llu->d_Uinv_bc_offset)); + // checkGPU(gpuFree(LUstruct->Llu->d_ilsum)); + // checkGPU(gpuFree(LUstruct->Llu->d_grid)); + // checkGPU(gpuFree(LUstruct->Llu->d_Lnzval_bc_dat)); + // checkGPU(gpuFree(LUstruct->Llu->d_Linv_bc_dat)); + // checkGPU(gpuFree(LUstruct->Llu->d_Uinv_bc_dat)); + + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_xsup, (n + 1) * sizeof(int_t))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_xsup, LUstruct->Glu_persist->xsup, (n + 1) * sizeof(int_t), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc( (void**)&LUstruct->Llu->d_bcols_masked, LUstruct->Llu->nbcol_masked * sizeof(int))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_bcols_masked, LUstruct->Llu->bcols_masked, LUstruct->Llu->nbcol_masked * sizeof(int), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_LRtree_ptr, CEILING(nsupers, grid->nprow) * sizeof(C_Tree))); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_LBtree_ptr, CEILING(nsupers, grid->npcol) * sizeof(C_Tree))); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_URtree_ptr, CEILING(nsupers, grid->nprow) * sizeof(C_Tree))); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_UBtree_ptr, CEILING(nsupers, grid->npcol) * sizeof(C_Tree))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_LRtree_ptr, LUstruct->Llu->LRtree_ptr, CEILING(nsupers, grid->nprow) * sizeof(C_Tree), gpuMemcpyHostToDevice)); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_LBtree_ptr, LUstruct->Llu->LBtree_ptr, CEILING(nsupers, grid->npcol) * sizeof(C_Tree), gpuMemcpyHostToDevice)); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_URtree_ptr, LUstruct->Llu->URtree_ptr, CEILING(nsupers, grid->nprow) * sizeof(C_Tree), gpuMemcpyHostToDevice)); + // 
checkGPU(gpuMemcpy(LUstruct->Llu->d_UBtree_ptr, LUstruct->Llu->UBtree_ptr, CEILING(nsupers, grid->npcol) * sizeof(C_Tree), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lrowind_bc_dat, (LUstruct->Llu->Lrowind_bc_cnt) * sizeof(int_t))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_Lrowind_bc_dat, LUstruct->Llu->Lrowind_bc_dat, (LUstruct->Llu->Lrowind_bc_cnt) * sizeof(int_t), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lindval_loc_bc_dat, (LUstruct->Llu->Lindval_loc_bc_cnt) * sizeof(int_t))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_Lindval_loc_bc_dat, LUstruct->Llu->Lindval_loc_bc_dat, (LUstruct->Llu->Lindval_loc_bc_cnt) * sizeof(int_t), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lrowind_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_Lrowind_bc_offset, LUstruct->Llu->Lrowind_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lindval_loc_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_Lindval_loc_bc_offset, LUstruct->Llu->Lindval_loc_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lnzval_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_Lnzval_bc_offset, LUstruct->Llu->Lnzval_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Linv_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_Linv_bc_offset, LUstruct->Llu->Linv_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Uinv_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_Uinv_bc_offset, LUstruct->Llu->Uinv_bc_offset, CEILING(nsupers, grid->npcol) * sizeof(long int), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_ilsum, (CEILING(nsupers, grid->nprow) + 1) * sizeof(int_t))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_ilsum, LUstruct->Llu->ilsum, (CEILING(nsupers, grid->nprow) + 1) * sizeof(int_t), gpuMemcpyHostToDevice)); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Lnzval_bc_dat, (LUstruct->Llu->Lnzval_bc_cnt) * sizeof(double))); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Linv_bc_dat, (LUstruct->Llu->Linv_bc_cnt) * sizeof(double))); + // checkGPU(gpuMalloc((void **)&LUstruct->Llu->d_Uinv_bc_dat, (LUstruct->Llu->Uinv_bc_cnt) * sizeof(double))); + // checkGPU(gpuMalloc( (void**)&LUstruct->Llu->d_grid, sizeof(gridinfo_t))); + // checkGPU(gpuMemcpy(LUstruct->Llu->d_grid, grid, sizeof(gridinfo_t), gpuMemcpyHostToDevice)); +#endif + +if (getenv("SUPERLU_ACC_SOLVE")){ +#ifdef GPU_ACC + + checkGPU(gpuMemcpy(LUstruct->Llu->d_Linv_bc_dat, LUstruct->Llu->Linv_bc_dat, + (LUstruct->Llu->Linv_bc_cnt) * sizeof(double), gpuMemcpyHostToDevice)); + checkGPU(gpuMemcpy(LUstruct->Llu->d_Uinv_bc_dat, LUstruct->Llu->Uinv_bc_dat, + (LUstruct->Llu->Uinv_bc_cnt) * sizeof(double), gpuMemcpyHostToDevice)); + checkGPU(gpuMemcpy(LUstruct->Llu->d_Lnzval_bc_dat, LUstruct->Llu->Lnzval_bc_dat, + (LUstruct->Llu->Lnzval_bc_cnt) * sizeof(double), gpuMemcpyHostToDevice)); +#endif +} + } + } + } + + + /* 
------------------------------------------------------------ + Compute the solution matrix X. + ------------------------------------------------------------ */ + if ((nrhs > 0) && (*info == 0)) + { + if (options->SolveInitialized == NO){ + if (getenv("SUPERLU_ACC_SOLVE")){ + if (getenv("NEW3DSOLVE") && Solve3D==true){ + pdgstrs_init_device_lsum_x(options, n, m_loc, nrhs, grid,LUstruct, SOLVEstruct,trf3Dpartition->supernodeMask); + }else{ + int* supernodeMask = int32Malloc_dist(nsupers); + for(int ii=0; iiutime[SOLVE] = 0.0; + if(Solve3D){ + + // if (!(b_work = doubleMalloc_dist(n))) + // ABORT("Malloc fails for b_work[]"); + /* ------------------------------------------------------ + Scale the right-hand side if equilibration was performed + ------------------------------------------------------*/ + if (notran) + { + if (rowequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= R[irow]; + ++irow; + } + b_col += ldb; + } + } + } + else if (colequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= C[irow]; + ++irow; + } + b_col += ldb; + } + } + + /* Save a copy of the right-hand side. */ + ldx = ldb; + if (!(X = doubleMalloc_dist(((size_t)ldx) * nrhs))) + ABORT("Malloc fails for X[]"); + x_col = X; + b_col = B; + for (j = 0; j < nrhs; ++j) + { + for (i = 0; i < m_loc; ++i) + x_col[i] = b_col[i]; + x_col += ldx; + b_col += ldb; + } + + /* ------------------------------------------------------ + Solve the linear system. + ------------------------------------------------------*/ + + if (options->SolveInitialized == NO) + /* First time */ + /* Inside this routine, SolveInitialized is set to YES. + For repeated call to pdgssvx3d(), no need to re-initialilze + the Solve data & communication structures, unless a new + factorization with Fact == DOFACT or SamePattern is asked for. */ + { + dSolveInit(options, A, perm_r, perm_c, nrhs, LUstruct, + grid, SOLVEstruct); + } + if (getenv("NEW3DSOLVE")){ + pdgstrs3d_newsolve (options, n, LUstruct,ScalePermstruct, trf3Dpartition, grid3d, X, + m_loc, fst_row, ldb, nrhs,SOLVEstruct, stat, info); + }else{ + pdgstrs3d (options, n, LUstruct,ScalePermstruct, trf3Dpartition, grid3d, X, + m_loc, fst_row, ldb, nrhs,SOLVEstruct, stat, info); + } + if (options->IterRefine) + { + /* Improve the solution by iterative refinement. */ + int_t *it, *colind_gsmv = SOLVEstruct->A_colind_gsmv; + dSOLVEstruct_t *SOLVEstruct1; /* Used by refinement */ + + t = SuperLU_timer_ (); + if (options->RefineInitialized == NO || Fact == DOFACT) { + /* All these cases need to re-initialize gsmv structure */ + if (options->RefineInitialized) + pdgsmv_finalize (SOLVEstruct->gsmv_comm); + pdgsmv_init (A, SOLVEstruct->row_to_proc, grid, + SOLVEstruct->gsmv_comm); + + /* Save a copy of the transformed local col indices + in colind_gsmv[]. 
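+                (pdgsmv_init permutes colind[] in place to the local indexing
+                its matvec needs; keeping this copy lets a later call with
+                Fact == SamePattern restore it without redoing the setup, as
+                the branch below does.)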
*/ + if (colind_gsmv) SUPERLU_FREE (colind_gsmv); + if (!(it = intMalloc_dist (nnz_loc))) + ABORT ("Malloc fails for colind_gsmv[]"); + colind_gsmv = SOLVEstruct->A_colind_gsmv = it; + for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i]; + options->RefineInitialized = YES; + } + else if (Fact == SamePattern || Fact == SamePattern_SameRowPerm) { + double at; + int_t k, jcol, p; + /* Swap to beginning the part of A corresponding to the + local part of X, as was done in pdgsmv_init() */ + for (i = 0; i < m_loc; ++i) { /* Loop through each row */ + k = rowptr[i]; + for (j = rowptr[i]; j < rowptr[i + 1]; ++j) + { + jcol = colind[j]; + p = SOLVEstruct->row_to_proc[jcol]; + if (p == iam) + { /* Local */ + at = a[k]; + a[k] = a[j]; + a[j] = at; + ++k; + } + } + } + + /* Re-use the local col indices of A obtained from the + previous call to pdgsmv_init() */ + for (i = 0; i < nnz_loc; ++i) + colind[i] = colind_gsmv[i]; + } + + if (nrhs == 1) + { /* Use the existing solve structure */ + SOLVEstruct1 = SOLVEstruct; + } + else { + /* For nrhs > 1, since refinement is performed for RHS + one at a time, the communication structure for pdgstrs + is different than the solve with nrhs RHS. + So we use SOLVEstruct1 for the refinement step. + */ + if (!(SOLVEstruct1 = (dSOLVEstruct_t *) + SUPERLU_MALLOC(sizeof(dSOLVEstruct_t)))) + ABORT ("Malloc fails for SOLVEstruct1"); + /* Copy the same stuff */ + SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc; + SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c; + SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs; + SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs; + SOLVEstruct1->diag_len = SOLVEstruct->diag_len; + SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm; + SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv; + + /* Initialize the *gstrs_comm for 1 RHS. */ + if (!(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *) + SUPERLU_MALLOC (sizeof (pxgstrs_comm_t)))) + ABORT ("Malloc fails for gstrs_comm[]"); + pdgstrs_init (n, m_loc, 1, fst_row, perm_r, perm_c, grid, + Glu_persist, SOLVEstruct1); + if (getenv("SUPERLU_ACC_SOLVE")){ + int_t nsupers = getNsupers(n, LUstruct->Glu_persist); + pdgstrs_init_device_lsum_x(options, n, m_loc, 1, grid,LUstruct, SOLVEstruct1,trf3Dpartition->supernodeMask); + } + } + + pdgsrfs3d (options, n, A, anorm, LUstruct, ScalePermstruct, grid3d, trf3Dpartition, + B, ldb, X, ldx, nrhs, SOLVEstruct1, berr, stat, info); + + /* Deallocate the storage associated with SOLVEstruct1 */ + if (nrhs > 1) + { + pdgstrs_delete_device_lsum_x(SOLVEstruct1); + pxgstrs_finalize (SOLVEstruct1->gstrs_comm); + SUPERLU_FREE (SOLVEstruct1); + } + + stat->utime[REFINE] = SuperLU_timer_ () - t; + } /* end IterRefine */ + }else{ + + if (grid3d->zscp.Iam == 0){ /* on 2D grid-0 */ + + /* ------------------------------------------------------ + Scale the right-hand side if equilibration was performed + ------------------------------------------------------*/ + if (notran) + { + if (rowequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= R[irow]; + ++irow; + } + b_col += ldb; + } + } + } + else if (colequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= C[irow]; + ++irow; + } + b_col += ldb; + } + } + + /* Save a copy of the right-hand side. 
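+       (B already holds the equilibrated right-hand side at this point; X
+       starts as a copy of it and is then overwritten by the solution, so B
+       stays intact for the residual computation in iterative refinement.)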
*/ + ldx = ldb; + if (!(X = doubleMalloc_dist(((size_t)ldx) * nrhs))) + ABORT("Malloc fails for X[]"); + x_col = X; + b_col = B; + for (j = 0; j < nrhs; ++j) + { + for (i = 0; i < m_loc; ++i) + x_col[i] = b_col[i]; + x_col += ldx; + b_col += ldb; + } + + /* ------------------------------------------------------ + Solve the linear system. + ------------------------------------------------------*/ + if (options->SolveInitialized == NO) + /* First time */ + /* Inside this routine, SolveInitialized is set to YES. + For repeated call to pdgssvx3d(), no need to re-initialilze + the Solve data & communication structures, unless a new + factorization with Fact == DOFACT or SamePattern is asked for. */ + { + dSolveInit(options, A, perm_r, perm_c, nrhs, LUstruct, + grid, SOLVEstruct); + } + pdgstrs(options, n, LUstruct, ScalePermstruct, grid, X, m_loc, + fst_row, ldb, nrhs, SOLVEstruct, stat, info); + + /* ------------------------------------------------------------ + Use iterative refinement to improve the computed solution and + compute error bounds and backward error estimates for it. + ------------------------------------------------------------ */ + if (options->IterRefine) + { + /* Improve the solution by iterative refinement. */ + int_t *it, *colind_gsmv = SOLVEstruct->A_colind_gsmv; + dSOLVEstruct_t *SOLVEstruct1; /* Used by refinement */ + + t = SuperLU_timer_ (); + if (options->RefineInitialized == NO || Fact == DOFACT) { + /* All these cases need to re-initialize gsmv structure */ + if (options->RefineInitialized) + pdgsmv_finalize (SOLVEstruct->gsmv_comm); + pdgsmv_init (A, SOLVEstruct->row_to_proc, grid, + SOLVEstruct->gsmv_comm); + + /* Save a copy of the transformed local col indices + in colind_gsmv[]. */ + if (colind_gsmv) SUPERLU_FREE (colind_gsmv); + if (!(it = intMalloc_dist (nnz_loc))) + ABORT ("Malloc fails for colind_gsmv[]"); + colind_gsmv = SOLVEstruct->A_colind_gsmv = it; + for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i]; + options->RefineInitialized = YES; + } + else if (Fact == SamePattern || Fact == SamePattern_SameRowPerm) { + double at; + int_t k, jcol, p; + /* Swap to beginning the part of A corresponding to the + local part of X, as was done in pdgsmv_init() */ + for (i = 0; i < m_loc; ++i) { /* Loop through each row */ + k = rowptr[i]; + for (j = rowptr[i]; j < rowptr[i + 1]; ++j) + { + jcol = colind[j]; + p = SOLVEstruct->row_to_proc[jcol]; + if (p == iam) + { /* Local */ + at = a[k]; + a[k] = a[j]; + a[j] = at; + ++k; + } + } + } + + /* Re-use the local col indices of A obtained from the + previous call to pdgsmv_init() */ + for (i = 0; i < nnz_loc; ++i) + colind[i] = colind_gsmv[i]; + } + + if (nrhs == 1) + { /* Use the existing solve structure */ + SOLVEstruct1 = SOLVEstruct; + } + else { + /* For nrhs > 1, since refinement is performed for RHS + one at a time, the communication structure for pdgstrs + is different than the solve with nrhs RHS. + So we use SOLVEstruct1 for the refinement step. 
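+                   Only gstrs_comm is rebuilt for the single-RHS case; the
+                   remaining fields are shared pointers into SOLVEstruct,
+                   which is why only gstrs_comm and the struct itself are
+                   freed after refinement.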
+ */ + if (!(SOLVEstruct1 = (dSOLVEstruct_t *) + SUPERLU_MALLOC(sizeof(dSOLVEstruct_t)))) + ABORT ("Malloc fails for SOLVEstruct1"); + /* Copy the same stuff */ + SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc; + SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c; + SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs; + SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs; + SOLVEstruct1->diag_len = SOLVEstruct->diag_len; + SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm; + SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv; + + /* Initialize the *gstrs_comm for 1 RHS. */ + if (!(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *) + SUPERLU_MALLOC (sizeof (pxgstrs_comm_t)))) + ABORT ("Malloc fails for gstrs_comm[]"); + pdgstrs_init (n, m_loc, 1, fst_row, perm_r, perm_c, grid, + Glu_persist, SOLVEstruct1); + if (getenv("SUPERLU_ACC_SOLVE")){ + int_t nsupers = getNsupers(n, LUstruct->Glu_persist); + int* supernodeMask = int32Malloc_dist(nsupers); + for(int ii=0; ii 1) + { + pdgstrs_delete_device_lsum_x(SOLVEstruct1); + pxgstrs_finalize (SOLVEstruct1->gstrs_comm); + SUPERLU_FREE (SOLVEstruct1); + } + + stat->utime[REFINE] = SuperLU_timer_ () - t; + } /* end IterRefine */ + } + } + +if (grid3d->zscp.Iam == 0) /* on 2D grid-0 */ + { + /* Permute the solution matrix B <= Pc'*X. */ + pdPermute_Dense_Matrix (fst_row, m_loc, SOLVEstruct->row_to_proc, + SOLVEstruct->inv_perm_c, + X, ldx, B, ldb, nrhs, grid); +#if ( DEBUGlevel>=2 ) + printf ("\n (%d) .. After pdPermute_Dense_Matrix(): b =\n", iam); + for (i = 0; i < m_loc; ++i) + printf ("\t(%d)\t%4d\t%.10f\n", iam, i + fst_row, B[i]); +#endif + /* Transform the solution matrix X to a solution of the original + system before the equilibration. */ + if (notran) + { + if (colequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= C[irow]; + ++irow; + } + b_col += ldb; + } + } + } + else if (rowequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= R[irow]; + ++irow; + } + b_col += ldb; + } + } + + // SUPERLU_FREE (b_work); + } + if (grid3d->zscp.Iam == 0 || Solve3D) + SUPERLU_FREE (X); + + } /* end if nrhs > 0 and factor successful */ + +#if ( PRNTlevel>=1 ) + if (!grid3d->iam) { + printf (".. DiagScale = %d\n", ScalePermstruct->DiagScale); + } +#endif + + + if ( grid3d->zscp.Iam == 0 ) { // only process layer 0 + /* Deallocate R and/or C if it was not used. 
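+          (DiagScale records which equilibration was applied: ROW means only
+          R was used, so C is freed; COL the reverse; NOEQUIL frees both;
+          BOTH keeps the two vectors for possible reuse under
+          SamePattern_SameRowPerm.)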
*/ + if (Equil && Fact != SamePattern_SameRowPerm) + { + switch (ScalePermstruct->DiagScale) { + case NOEQUIL: + SUPERLU_FREE (R); + SUPERLU_FREE (C); + break; + case ROW: + SUPERLU_FREE (C); + break; + case COL: + SUPERLU_FREE (R); + break; + default: break; + } + } + +#if 0 + if (!factored && Fact != SamePattern_SameRowPerm && !parSymbFact) + Destroy_CompCol_Permuted_dist (&GAC); +#endif + + } /* process layer 0 done solve */ + + /* Scatter the solution from 2D grid-0 to 3D grid */ + if (nrhs > 0) + dScatter_B3d(A3d, grid3d); + + B = A3d->B3d; // B is now assigned back to B3d on return + A->Store = Astore3d; // restore Astore to 3D + +#if (DEBUGlevel >= 1) + CHECK_MALLOC(iam, "Exit pdgssvx3d()"); +#endif +} diff --git a/SRC/pdutil.c b/SRC/pdutil.c index d3094d31..09413734 100755 --- a/SRC/pdutil.c +++ b/SRC/pdutil.c @@ -1258,6 +1258,37 @@ void dSolveFinalize(superlu_dist_options_t *options, dSOLVEstruct_t *SOLVEstruct } } /* dSolveFinalize */ +#if 0 +void dDestroy_A3d_gathered_on_2d(dSOLVEstruct_t *SOLVEstruct, gridinfo3d_t *grid3d) +{ + /* free A2d and B2d, which are allocated only in 2D layer grid-0 */ + NRformat_loc3d *A3d = SOLVEstruct->A3d; + NRformat_loc *A2d = A3d->A_nfmt; + if (grid3d->zscp.Iam == 0) + { + SUPERLU_FREE(A2d->rowptr); + SUPERLU_FREE(A2d->colind); + SUPERLU_FREE(A2d->nzval); + } + SUPERLU_FREE(A3d->row_counts_int); // free displacements and counts + SUPERLU_FREE(A3d->row_disp); + SUPERLU_FREE(A3d->nnz_counts_int); + SUPERLU_FREE(A3d->nnz_disp); + SUPERLU_FREE(A3d->b_counts_int); + SUPERLU_FREE(A3d->b_disp); + int rankorder = grid3d->rankorder; + if (rankorder == 0) + { /* Z-major in 3D grid */ + SUPERLU_FREE(A3d->procs_to_send_list); + SUPERLU_FREE(A3d->send_count_list); + SUPERLU_FREE(A3d->procs_recv_from_list); + SUPERLU_FREE(A3d->recv_count_list); + } + SUPERLU_FREE(A2d); // free 2D structure + SUPERLU_FREE(A3d); // free 3D structure +} /* dDestroy_A3d_gathered_on_2d */ + +#else void dDestroy_A3d_gathered_on_2d(dSOLVEstruct_t *SOLVEstruct, gridinfo3d_t *grid3d) { /* free A2d and B2d, which are allocated on all 2D layers*/ @@ -1283,7 +1314,9 @@ void dDestroy_A3d_gathered_on_2d(dSOLVEstruct_t *SOLVEstruct, gridinfo3d_t *grid } SUPERLU_FREE( A2d ); // free 2D structure SUPERLU_FREE( A3d ); // free 3D structure -} /* dDestroy_A3d_gathered_on_2d */ +} /* dDestroy_A3d_gathered_on_2d_allgrid */ +#endif + /*! 
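 (Note on the pdutil.c hunk above: the retained dDestroy_A3d_gathered_on_2d
 drops the old zscp.Iam == 0 guard around the A2d frees because, with the
 all-grid distribution, A2d and B2d are now allocated on every 2D layer
 rather than only on layer grid-0.)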
\brief Check the inf-norm of the error vector diff --git a/SRC/ssvx3dAux.c b/SRC/ssvx3dAux.c index b2dcb88f..c9142d91 100644 --- a/SRC/ssvx3dAux.c +++ b/SRC/ssvx3dAux.c @@ -243,7 +243,9 @@ void findRowPerm_MC64(gridinfo_t* grid, int_t job, double* R1, double* C1, int_t* iinfo) { + #if ( DEBUGlevel>=1 ) LOG_FUNC_ENTER(); + #endif // Check input parameters if (colptr == NULL || rowind == NULL || a_GA == NULL || perm_r == NULL ) { @@ -297,7 +299,9 @@ void scale_distributed_matrix(int_t rowequ, int_t colequ, int_t m, int_t n, int_t m_loc, int_t *rowptr, int_t *colind, int_t fst_row, double *a, double *R, double *C, double *R1, double *C1) { - printf("\033[1;32mEntering function scale_distributed_matrix at %s:%d\033[0m\n", __FILE__, __LINE__); + #if ( DEBUGlevel>=1 ) + LOG_FUNC_ENTER(); + #endif // Scale the row and column factors for (int i = 0; i < n; ++i) { R1[i] = exp(R1[i]); @@ -503,7 +507,9 @@ void perform_row_permutation( int_t *colequ, int_t *iinfo) { + #if ( DEBUGlevel>=1 ) LOG_FUNC_ENTER(); + #endif int_t *perm_r = ScalePermstruct->perm_r; /* Get NC format data from SuperMatrix GA */ NCformat* GAstore = (NCformat *)GA->Store; @@ -648,7 +654,9 @@ void permCol_SymbolicFact3d(superlu_dist_options_t *options, int_t n, SuperMatri superlu_dist_mem_usage_t*symb_mem_usage, gridinfo3d_t* grid3d) { + #if ( DEBUGlevel>=1 ) LOG_FUNC_ENTER(); + #endif SuperMatrix GAC; /* Global A in NCP format */ NCPformat *GACstore; int_t *GACcolbeg, *GACcolend, *GACrowind, irow;
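
The ssvx3dAux.c hunks above replace an unconditional colored printf with
LOG_FUNC_ENTER() calls guarded by DEBUGlevel, so release builds stay silent.
A minimal sketch of what such an entry logger can look like (a hypothetical
stand-in; the project's actual LOG_FUNC_ENTER is defined elsewhere in SRC):

    #include <stdio.h>

    /* Hypothetical entry logger; the call sites above additionally wrap
       each use in #if ( DEBUGlevel>=1 ) ... #endif. */
    #define LOG_FUNC_ENTER() \
        do { printf("Entering function %s at %s:%d\n", \
                    __func__, __FILE__, __LINE__); \
             fflush(stdout); } while (0)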