
Commit 28d78c4

resolve merge conflicts

liuyangzhuan committed Oct 24, 2023
2 parents a1d2cc1 + e371977
Showing 12 changed files with 243 additions and 28 deletions.
16 changes: 13 additions & 3 deletions SRC/TRF3dV100/schurCompUpdate.cu
@@ -940,7 +940,7 @@ int_t LUstruct_v100::dSchurCompUpdatePartGPU(
double alpha = 1.0;
double beta = 0.0;
#ifndef NDEBUG
printf("m=%d, n=%d, k=%d\n", gemm_m, gemm_n, gemm_k);
// printf("m=%d, n=%d, k=%d\n", gemm_m, gemm_n, gemm_k);
#endif
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
gemm_m, gemm_n, gemm_k, &alpha,
@@ -1088,6 +1088,8 @@ int_t LUstruct_v100::setLUstruct_GPU()

size_t totalNzvalSize = 0; /* too big for gemmBufferSize */
size_t max_gemmCsize = 0; /* Sherry added 2/20/2023 */
size_t max_nzrow = 0; /* Yang added 10/20/2023 */
size_t max_nzcol = 0;

/*Memory for lpapenl and upanel Data*/
for (i = 0; i < CEILING(nsupers, Pc); ++i)
@@ -1096,6 +1098,8 @@ int_t LUstruct_v100::setLUstruct_GPU()
{
memReqData += lPanelVec[i].totalSize();
totalNzvalSize += lPanelVec[i].nzvalSize();
if(lPanelVec[i].nzvalSize()>0)
max_nzrow = SUPERLU_MAX(lPanelVec[i].nzrows(),max_nzrow);
//max_gemmCsize = SUPERLU_MAX(max_gemmCsize, ???);
}
}
@@ -1105,8 +1109,11 @@ int_t LUstruct_v100::setLUstruct_GPU()
{
memReqData += uPanelVec[i].totalSize();
totalNzvalSize += uPanelVec[i].nzvalSize();
if(uPanelVec[i].nzvalSize()>0)
max_nzcol = SUPERLU_MAX(uPanelVec[i].nzcols(),max_nzcol);
}
}
max_gemmCsize = max_nzcol*max_nzrow;

memReqData += CEILING(nsupers, Pc) * sizeof(lpanelGPU_t);
memReqData += CEILING(nsupers, Pr) * sizeof(upanelGPU_t);
@@ -1118,8 +1125,11 @@ int_t LUstruct_v100::setLUstruct_GPU()
int_t maxBuffSize = sp_ienv_dist (8, options);
int maxsup = sp_ienv_dist(3, options); // max. supernode size
maxBuffSize = SUPERLU_MAX(maxsup * maxsup, maxBuffSize); // Sherry added 7/10/23
A_gpu.gemmBufferSize = SUPERLU_MIN(maxBuffSize, totalNzvalSize);

#if 0
A_gpu.gemmBufferSize = SUPERLU_MIN(maxBuffSize, totalNzvalSize);
#else
A_gpu.gemmBufferSize = SUPERLU_MIN(maxBuffSize, SUPERLU_MAX(max_gemmCsize,totalNzvalSize)); /* Yang added 10/20/2023 */
#endif
size_t dataPerStream = 3 * sizeof(double) * maxLvalCount + 3 * sizeof(double) * maxUvalCount + 2 * sizeof(int_t) * maxLidxCount + 2 * sizeof(int_t) * maxUidxCount + A_gpu.gemmBufferSize * sizeof(double) + ldt * ldt * sizeof(double);
if (memReqData + 2 * dataPerStream > useableGPUMem)
{
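Taken together, the hunks above make the GPU GEMM buffer at least as large as the biggest Schur-complement block C, whose dimensions are bounded by the largest panel row and column counts. The following is a minimal sketch of that sizing rule in plain C, not the SuperLU_DIST code itself; the function name and the standalone formulation are illustrative only.

#include <stddef.h>

/* Sketch: the largest GEMM output block has max_nzrow * max_nzcol entries,
 * so the buffer must cover the larger of that product and totalNzvalSize,
 * capped at the user-tunable maxBuffSize. */
static size_t size_gemm_buffer(size_t max_nzrow, size_t max_nzcol,
                               size_t totalNzvalSize, size_t maxBuffSize)
{
    size_t max_gemmCsize = max_nzrow * max_nzcol;
    size_t needed = (max_gemmCsize > totalNzvalSize) ? max_gemmCsize
                                                     : totalNzvalSize;
    return (needed < maxBuffSize) ? needed : maxBuffSize;
}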
2 changes: 1 addition & 1 deletion SRC/double/dSchCompUdt-gpu.c
@@ -216,7 +216,7 @@ if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
size_t C_stream_size = nbrow * num_col_stream * sizeof(double);

// Sherry: Check dC buffer of *buffer_size* is large enough
assert(nbrow*(st_col+num_col_stream) < buffer_size);
assert(nbrow*(st_col+num_col_stream) <= buffer_size);

gpuMemcpyAsync(dB+b_offset, tempu+b_offset, B_stream_size,
gpuMemcpyHostToDevice, streams[stream_id]);
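The relaxed assertion admits the boundary case in which the streamed columns fill the dC buffer exactly: with, say, nbrow = 100, st_col = 0, num_col_stream = 50 and buffer_size = 5000 (numbers chosen only for illustration), nbrow*(st_col+num_col_stream) equals buffer_size and the copy is still in bounds; only a strictly larger product would overflow.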
5 changes: 4 additions & 1 deletion SRC/double/dlook_ahead_update.c
@@ -121,9 +121,12 @@ while (j < nub && perm_u[2 * j] <= k0 + num_look_aheads)
/* Sherry -- examine all the shared variables ??
'firstprivate' ensures that the private variables are initialized
to the values before entering the loop. */
#if !defined __INTEL_LLVM_COMPILER
/* Yang: this parallel for is causing a segfault with oneAPI compilers */
#pragma omp parallel for \
firstprivate(lptr,luptr,ib,current_b) private(lb) \
firstprivate(lptr,luptr,current_b) private(ib,lb) \
default(shared) schedule(dynamic)
#endif
#endif
for (lb = 0; lb < nlb; lb++) { /* Loop through each block in L(:,k) */
int temp_nbrow; /* automatic variable is private */
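The guard above compiles the parallel-for only when __INTEL_LLVM_COMPILER is not defined (that macro is set by the Intel oneAPI icx/icpx compilers), and it also moves ib from firstprivate to private. Below is a minimal, self-contained sketch of the same guard pattern in plain C; the loop body is illustrative, not the SuperLU_DIST update.

#include <stdio.h>

int main(void)
{
    int nlb = 8;   /* number of blocks, illustrative value */
#if defined(_OPENMP) && !defined(__INTEL_LLVM_COMPILER)
    /* With the oneAPI compilers the pragma is skipped and the loop runs
     * sequentially; other OpenMP compilers parallelize it as before. */
    #pragma omp parallel for default(shared) schedule(dynamic)
#endif
    for (int lb = 0; lb < nlb; lb++) {
        printf("processing block %d\n", lb);
    }
    return 0;
}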
1 change: 0 additions & 1 deletion SRC/double/pdgssvx.c
@@ -2362,7 +2362,6 @@ void dpacked2skyline(int_t k, int_t *usubpack, double *valpack, int_t *usub, dou

usubPtr += UB_DESCRIPTOR + gsupc;
}
return 0;
}


11 changes: 10 additions & 1 deletion SRC/include/superlu_ddefs.h
@@ -1575,7 +1575,16 @@ extern void pdconvert_flatten_skyline2UROWDATA(superlu_dist_options_t *options,
dLUstruct_t *LUstruct, SuperLUStat_t *stat, int n);
extern void pdconvertUROWDATA2skyline(superlu_dist_options_t *options, gridinfo_t *grid,
dLUstruct_t *LUstruct, SuperLUStat_t *stat, int n);

extern int_t
dReDistribute_A(SuperMatrix *A, dScalePermstruct_t *ScalePermstruct,
Glu_freeable_t *Glu_freeable, int_t *xsup, int_t *supno,
gridinfo_t *grid, int_t *colptr[], int_t *rowind[],
double *a[]);
extern float
pddistribute3d_Yang(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
dScalePermstruct_t *ScalePermstruct,
Glu_freeable_t *Glu_freeable, dLUstruct_t *LUstruct,
gridinfo3d_t *grid3d);
#if 0 // NOT CALLED
/* from ancFactorization.h (not called) */
extern int_t ancestorFactor(
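The declarations added above export dReDistribute_A and pddistribute3d_Yang through the public header. A hypothetical call sketch that only mirrors the declared signatures; the variable names are illustrative, and reading the float return value as a memory-usage figure (as with other pddistribute variants) is an assumption, not something stated in this diff.

int_t *colptr, *rowind;
double *a;
int_t ret = dReDistribute_A(A, ScalePermstruct, Glu_freeable, xsup, supno,
                            grid, &colptr, &rowind, &a);
float mem_use = pddistribute3d_Yang(options, n, A, ScalePermstruct,
                                    Glu_freeable, LUstruct, grid3d);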
7 changes: 7 additions & 0 deletions SRC/include/superlu_defs.h
@@ -1461,6 +1461,13 @@ extern int_t Wait_LDiagBlock_Recv(MPI_Request *, SCT_t *);
extern int_t Test_LDiagBlock_Recv(MPI_Request *, SCT_t *);
extern int_t LDiagBlockRecvWait( int_t k, int* factored_U, MPI_Request *, gridinfo_t *);


extern int_t num_full_cols_U_mod(
int_t kk, int_t *usub, int_t *xsup,
gridinfo_t *grid, int_t *perm_u,
int_t *ldu /* max. segment size of nonzero columns in U(kk,:) */
);

/*=====================*/

#ifdef __cplusplus
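The new prototype documents ldu as an output holding the maximum segment size of the nonzero columns in U(kk,:). A hypothetical call mirroring only the declared signature; interpreting the return value as the number of full columns is an assumption based on the function name.

int_t ldu = 0;
int_t nfullcols = num_full_cols_U_mod(kk, usub, xsup, grid, perm_u, &ldu);
/* ldu now holds the max. segment size of nonzero columns in U(kk,:). */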
(changes to another example script follow; the file-name header was not captured in this diff view)
@@ -25,7 +25,7 @@ export LD_LIBRARY_PATH=${CRAY_LD_LIBRARY_PATH}:$LD_LIBRARY_PATH

export SUPERLU_LBS=GD
export SUPERLU_ACC_OFFLOAD=1 # this can be 0 to do CPU tests on GPU nodes
export GPU3DVERSION=1
export GPU3DVERSION=0
export ANC25D=0
export NEW3DSOLVE=1
export NEW3DSOLVETREECOMM=1
@@ -74,8 +74,8 @@ else
# Host unknown; exiting
exit $EXIT_HOST
fi
nprows=(2)
npcols=(2 )
nprows=(1)
npcols=(1 )
npz=(1)
nrhs=(1)
NTH=1
@@ -123,14 +123,16 @@ export MPICH_MAX_THREAD_SAFETY=multiple

# export NSUP=256
# export NREL=256
for MAT in big.rua
# for MAT in big.rua
# for MAT in Geo_1438.bin
# for MAT in g20.rua
# for MAT in s1_mat_0_253872.bin s2D9pt2048.rua
# for MAT in dielFilterV3real.bin
# for MAT in rma10.mtx
# for MAT in rma10.mtx
# for MAT in raefsky3.mtx
# for MAT in s2D9pt2048.rua raefsky3.mtx rma10.mtx
# for MAT in s1_mat_0_126936.bin # for MAT in s1_mat_0_126936.bin
# for MAT in s2D9pt2048.rua
for MAT in s2D9pt2048.rua
# for MAT in s2D9pt1536.rua
# for MAT in s1_mat_0_126936.bin s1_mat_0_253872.bin s1_mat_0_507744.bin
# for MAT in matrix_ACTIVSg70k_AC_00.mtx matrix_ACTIVSg10k_AC_00.mtx
@@ -140,16 +142,23 @@ do
mkdir -p $MAT
for ii in `seq 1 $NREP`
do
# export SUPERLU_ACC_SOLVE=1

# export SUPERLU_ACC_OFFLOAD=1
# SUPERLU_ACC_OFFLOAD=0
# srun -n $NCORE_VAL_TOT2D -N $NODE_VAL2D -c $TH_PER_RANK --cpu_bind=cores ./EXAMPLE/pddrive -c $NCOL -r $NROW -b $batch $CFS/m2957/liuyangz/my_research/matrix/$MAT | tee ./$MAT/SLU.o_mpi_${NROW}x${NCOL}_${NTH}_1rhs_2d_gpu_${SUPERLU_ACC_OFFLOAD}
# export SUPERLU_ACC_OFFLOAD=0

# SUPERLU_ACC_OFFLOAD=1
# srun -n $NCORE_VAL_TOT2D -N $NODE_VAL2D -c $TH_PER_RANK --cpu_bind=cores ./EXAMPLE/pddrive -c $NCOL -r $NROW -b $batch $CFS/m2957/liuyangz/my_research/matrix/$MAT | tee ./$MAT/SLU.o_mpi_${NROW}x${NCOL}_${NTH}_1rhs_2d_gpu_${SUPERLU_ACC_OFFLOAD}

unset SUPERLU_ACC_SOLVE
echo "srun -n $NCORE_VAL_TOT -c $TH_PER_RANK --cpu_bind=cores ./EXAMPLE/pddrive3d -c $NCOL -r $NROW -d $NPZ -b $batch -i 0 -s $NRHS $CFS/m2957/liuyangz/my_research/matrix/$MAT | tee ./$MAT/SLU.o_mpi_${NROW}x${NCOL}x${NPZ}_${OMP_NUM_THREADS}_3d_newest_gpusolve_${SUPERLU_ACC_SOLVE}_nrhs_${NRHS}"
srun -n $NCORE_VAL_TOT -c $TH_PER_RANK --cpu_bind=cores ./EXAMPLE/pddrive3d -c $NCOL -r $NROW -d $NPZ -b $batch -i 0 -s $NRHS $CFS/m2957/liuyangz/my_research/matrix/$MAT | tee ./$MAT/SLU.o_mpi_${NROW}x${NCOL}x${NPZ}_${OMP_NUM_THREADS}_3d_newest_gpusolve_${SUPERLU_ACC_SOLVE}_nrhs_${NRHS}
# SUPERLU_ACC_OFFLOAD=1
# export GPU3DVERSION=0
# echo "srun -n $NCORE_VAL_TOT -c $TH_PER_RANK --cpu_bind=cores ./EXAMPLE/pddrive3d -c $NCOL -r $NROW -d $NPZ -b $batch -i 0 -s $NRHS $CFS/m2957/liuyangz/my_research/matrix/$MAT | tee ./$MAT/SLU.o_mpi_${NROW}x${NCOL}x${NPZ}_${OMP_NUM_THREADS}_3d_newest_gpusolve_${SUPERLU_ACC_SOLVE}_nrhs_${NRHS}_gpu_${SUPERLU_ACC_OFFLOAD}_cpp_${GPU3DVERSION}"
# srun -n $NCORE_VAL_TOT -c $TH_PER_RANK --cpu_bind=cores ./EXAMPLE/pddrive3d -c $NCOL -r $NROW -d $NPZ -b $batch -i 0 -s $NRHS $CFS/m2957/liuyangz/my_research/matrix/$MAT | tee ./$MAT/SLU.o_mpi_${NROW}x${NCOL}x${NPZ}_${OMP_NUM_THREADS}_3d_newest_gpusolve_${SUPERLU_ACC_SOLVE}_nrhs_${NRHS}_gpu_${SUPERLU_ACC_OFFLOAD}_cpp_${GPU3DVERSION}

SUPERLU_ACC_OFFLOAD=1
export GPU3DVERSION=1
echo "srun -n $NCORE_VAL_TOT -c $TH_PER_RANK --cpu_bind=cores ./EXAMPLE/pddrive3d -c $NCOL -r $NROW -d $NPZ -b $batch -i 0 -s $NRHS $CFS/m2957/liuyangz/my_research/matrix/$MAT | tee ./$MAT/SLU.o_mpi_${NROW}x${NCOL}x${NPZ}_${OMP_NUM_THREADS}_3d_newest_gpusolve_${SUPERLU_ACC_SOLVE}_nrhs_${NRHS}_gpu_${SUPERLU_ACC_OFFLOAD}_cpp_${GPU3DVERSION}"
srun -n $NCORE_VAL_TOT -c $TH_PER_RANK --cpu_bind=cores ./EXAMPLE/pddrive3d -c $NCOL -r $NROW -d $NPZ -b $batch -i 0 -s $NRHS $CFS/m2957/liuyangz/my_research/matrix/$MAT | tee ./$MAT/SLU.o_mpi_${NROW}x${NCOL}x${NPZ}_${OMP_NUM_THREADS}_3d_newest_gpusolve_${SUPERLU_ACC_SOLVE}_nrhs_${NRHS}_gpu_${SUPERLU_ACC_OFFLOAD}_cpp_${GPU3DVERSION}


# export SUPERLU_ACC_SOLVE=1
# srun -n $NCORE_VAL_TOT -c $TH_PER_RANK --cpu_bind=cores valgrind --leak-check=yes ./EXAMPLE/pddrive3d -c $NCOL -r $NROW -d $NPZ -b $batch -i 0 -s $NRHS $CFS/m2957/liuyangz/my_research/matrix/$MAT | tee ./$MAT/SLU.o_mpi_${NROW}x${NCOL}x${NPZ}_${OMP_NUM_THREADS}_3d_newest_gpusolve_${SUPERLU_ACC_SOLVE}_nrhs_${NRHS}
(changes to a second example script follow; the file-name header was not captured in this diff view)
@@ -27,7 +27,7 @@ export LD_LIBRARY_PATH=${CRAY_LD_LIBRARY_PATH}:$LD_LIBRARY_PATH

export SUPERLU_LBS=GD
export SUPERLU_ACC_OFFLOAD=1 # this can be 0 to do CPU tests on GPU nodes
export GPU3DVERSION=1
export GPU3DVERSION=0
export ANC25D=0
export NEW3DSOLVE=1
export NEW3DSOLVETREECOMM=1
@@ -81,8 +81,8 @@ fi
# npz=(64 32 16)
# nrhs=(1 50)

nprows=(2 )
npcols=(2 )
nprows=(1 )
npcols=(1 )
npz=(1 )
nrhs=(1)

@@ -138,8 +138,8 @@ export MPICH_MAX_THREAD_SAFETY=multiple
# for MAT in s1_mat_0_253872.bin s2D9pt2048.rua
# for MAT in dielFilterV3real.bin
# for MAT in Geo_1438.bin s2D9pt2048.rua raefsky3.mtx rma10.mtx
# for MAT in Geo_1438.bin
for MAT in s1_mat_0_126936.bin
for MAT in Geo_1438.bin
# for MAT in s1_mat_0_126936.bin
# for MAT in s2D9pt2048.rua
# for MAT in s2D9pt1536.rua
# for MAT in s1_mat_0_126936.bin s1_mat_0_253872.bin s1_mat_0_507744.bin
@@ -153,7 +153,7 @@ do
# export SUPERLU_ACC_SOLVE=1


# # srun -n $NCORE_VAL_TOT2D -N $NODE_VAL2D -c $TH_PER_RANK --cpu_bind=cores ./EXAMPLE/pddrive -c $NCOL -r $NROW -b $batch $CFS/m2957/liuyangz/my_research/matrix/$MAT | tee ./$MAT/SLU.o_mpi_${NROW}x${NCOL}_${NTH}_1rhs_2d_gpu_${SUPERLU_ACC_OFFLOAD}
# # # srun -n $NCORE_VAL_TOT2D -N $NODE_VAL2D -c $TH_PER_RANK --cpu_bind=cores ./EXAMPLE/pddrive -c $NCOL -r $NROW -b $batch $CFS/m2957/liuyangz/my_research/matrix/$MAT | tee ./$MAT/SLU.o_mpi_${NROW}x${NCOL}_${NTH}_1rhs_2d_gpu_${SUPERLU_ACC_OFFLOAD}
# export SUPERLU_ACC_OFFLOAD=0
# srun -n $NCORE_VAL_TOT2D -N $NODE_VAL2D -c $TH_PER_RANK --cpu_bind=cores ./EXAMPLE/pddrive -c $NCOL -r $NROW -b $batch $CFS/m2957/liuyangz/my_research/matrix/$MAT | tee ./$MAT/SLU.o_mpi_${NROW}x${NCOL}_${NTH}_1rhs_2d_gpu_${SUPERLU_ACC_OFFLOAD}

115 changes: 115 additions & 0 deletions example_scripts/batch_script_mpi_runit_sunspot_intel_nogpu.sh
@@ -0,0 +1,115 @@
#!/bin/bash
module load spack cmake
ulimit -s unlimited


#SUPERLU settings:
export SUPERLU_LBS=GD
export SUPERLU_ACC_OFFLOAD=0 # this can be 0 to do CPU tests on GPU nodes
export GPU3DVERSION=1
export ANC25D=0
export NEW3DSOLVE=1
export NEW3DSOLVETREECOMM=1
export SUPERLU_BIND_MPI_GPU=1 # assign GPU based on the MPI rank, assuming one MPI per GPU

export SUPERLU_MAXSUP=256 # max supernode size
export SUPERLU_RELAX=64 # upper bound for relaxed supernode size
export SUPERLU_MAX_BUFFER_SIZE=10000000 ## 500000000 # buffer size in words on GPU
export SUPERLU_NUM_LOOKAHEADS=2 ##4, must be at least 2, see 'lookahead winSize'
export SUPERLU_NUM_GPU_STREAMS=1
export SUPERLU_MPI_PROCESS_PER_GPU=1 # 2: this can better saturate GPU
export SUPERLU_N_GEMM=6000 # FLOPS threshold to divide the workload between CPU and GPU




CPUS_PER_NODE=104
THREADS_PER_NODE=208
nprows=(1)
npcols=(1 )
npz=(1)
nrhs=(1)
NTH=2
NREP=1
# NODE_VAL_TOT=1

for ((i = 0; i < ${#npcols[@]}; i++)); do
NROW=${nprows[i]}
NCOL=${npcols[i]}
NPZ=${npz[i]}
for ((s = 0; s < ${#nrhs[@]}; s++)); do
NRHS=${nrhs[s]}
CORE_VAL2D=`expr $NCOL \* $NROW`
NODE_VAL2D=`expr $CORE_VAL2D / $CPUS_PER_NODE`
MOD_VAL=`expr $CORE_VAL2D % $CPUS_PER_NODE`
if [[ $MOD_VAL -ne 0 ]]
then
NODE_VAL2D=`expr $NODE_VAL2D + 1`
fi

CORE_VAL=`expr $NCOL \* $NROW \* $NPZ`
NODE_VAL=`expr $CORE_VAL / $CPUS_PER_NODE`
MOD_VAL=`expr $CORE_VAL % $CPUS_PER_NODE`
if [[ $MOD_VAL -ne 0 ]]
then
NODE_VAL=`expr $NODE_VAL + 1`
fi

# NODE_VAL=2
# NCORE_VAL_TOT=`expr $NODE_VAL_TOT \* $CORES_PER_NODE / $NTH`
batch=0 # whether to do batched test
NCORE_VAL_TOT=`expr $NROW \* $NCOL \* $NPZ `
NCORE_VAL_TOT2D=`expr $NROW \* $NCOL `

OMP_NUM_THREADS=$NTH

export OMP_NUM_THREADS=$NTH
export OMP_PLACES=threads
export OMP_PROC_BIND=spread
export MPICH_MAX_THREAD_SAFETY=multiple
#export OMP_MAX_ACTIVE_LEVELS=1
#export OMP_DYNAMIC=TRUE

# srun -n 1 ./EXAMPLE/pddrive -r 1 -c 1 ../EXAMPLE/g20.rua

# export NSUP=256
# export NREL=256
# for MAT in big.rua
# for MAT in g20.rua
# for MAT in s1_mat_0_253872.bin s2D9pt2048.rua
# for MAT in dielFilterV3real.bin
for MAT in rma10.mtx
# for MAT in s2D9pt2048.rua raefsky3.mtx rma10.mtx
# for MAT in s1_mat_0_126936.bin # for MAT in s1_mat_0_126936.bin
# for MAT in s2D9pt2048.rua
# for MAT in s2D9pt1536.rua
# for MAT in s1_mat_0_126936.bin s1_mat_0_253872.bin s1_mat_0_507744.bin
# for MAT in matrix_ACTIVSg70k_AC_00.mtx matrix_ACTIVSg10k_AC_00.mtx
# for MAT in temp_13k.mtx temp_25k.mtx temp_75k.mtx
# for MAT in temp_13k.mtx
do
mkdir -p $MAT
for ii in `seq 1 $NREP`
do

SUPERLU_ACC_SOLVE=0

mpirun -n $NCORE_VAL_TOT2D --depth $NTH --cpu-bind depth ./EXAMPLE/pddrive -c $NCOL -r $NROW -b $batch ~/my_research/matrix/$MAT | tee ./$MAT/SLU.o_mpi_${NROW}x${NCOL}_${NTH}_1rhs_2d

#mpirun -n $NCORE_VAL_TOT --depth $NTH --cpu-bind depth ./EXAMPLE/pddrive3d -c $NCOL -r $NROW -d $NPZ -b $batch -i 0 -s $NRHS ~/my_research/matrix/$MAT | tee ./$MAT/SLU.o_mpi_${NROW}x${NCOL}x${NPZ}_${OMP_NUM_THREADS}_3d_newest_gpusolve_${SUPERLU_ACC_SOLVE}_nrhs_${NRHS}


done

done
done
done
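For reference, the node-count arithmetic in this new script is a ceiling division spelled out with expr: NODE_VAL is CORE_VAL / CPUS_PER_NODE, incremented by one when the remainder is nonzero. With CPUS_PER_NODE=104 (values for illustration), the 1x1x1 grid used here needs NODE_VAL=1, while 105 or more ranks would round up to 2 nodes.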









6 changes: 3 additions & 3 deletions example_scripts/run_cmake_build_perlmutter_gcc_nvshmem.sh
@@ -42,9 +42,9 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH//\/usr\/local\/cuda-11.7\/compat:/}
NVSHMEM_HOME=/global/cfs/cdirs/m3894/lib/PrgEnv-gnu/nvshmem_src_2.8.0-3/build/
#NVSHMEM_HOME=${CRAY_NVIDIA_PREFIX}/comm_libs/nvshmem/
cmake .. \
-DCMAKE_C_FLAGS="-DGPU_SOLVE -std=c11 -DPRNTlevel=0 -DPROFlevel=0 -DDEBUGlevel=0 -DAdd_" \
-DCMAKE_CXX_FLAGS="" \
-DCMAKE_Fortran_FLAGS="" \
-DCMAKE_C_FLAGS="-O2 -DGPU_SOLVE -std=c11 -DPRNTlevel=0 -DPROFlevel=0 -DDEBUGlevel=0 -DAdd_" \
-DCMAKE_CXX_FLAGS="-O2" \
-DCMAKE_Fortran_FLAGS="-O2" \
-DCMAKE_CXX_COMPILER=CC \
-DCMAKE_C_COMPILER=cc \
-DCMAKE_Fortran_COMPILER=ftn \
31 changes: 31 additions & 0 deletions example_scripts/run_cmake_build_sunspot_oneAPI_mkl_cpu.sh
@@ -0,0 +1,31 @@
#!/bin/bash

module load spack cmake


cmake .. \
-DCMAKE_C_FLAGS="-DGPU_SOLVE -std=c11 -D_XOPEN_SOURCE -DPRNTlevel=0 -DPROFlevel=0 -DDEBUGlevel=0 -DAdd_ -I${MKLROOT}/include -fopenmp" \
-DCMAKE_CXX_FLAGS="-I${MKLROOT}/include -fopenmp" \
-DCMAKE_CXX_COMPILER=mpicxx \
-DCMAKE_C_COMPILER=mpicc \
-DCMAKE_Fortran_COMPILER=mpif90 \
-DXSDK_ENABLE_Fortran=OFF \
-DTPL_ENABLE_INTERNAL_BLASLIB=OFF \
-DTPL_ENABLE_LAPACKLIB=ON \
-DBUILD_SHARED_LIBS=ON \
-DTPL_ENABLE_CUDALIB=OFF \
-DCMAKE_INSTALL_PREFIX=. \
-DCMAKE_INSTALL_LIBDIR=./lib \
-DCMAKE_BUILD_TYPE=Debug \
-DTPL_BLAS_LIBRARIES="-L${MKLROOT}/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core" \
-DTPL_LAPACK_LIBRARIES="-L${MKLROOT}/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core" \
-DTPL_PARMETIS_INCLUDE_DIRS="/home/liuyangz/my_software/parmetis-4.0.3/include;/home/liuyangz/my_software/parmetis-4.0.3/metis/include" \
-DTPL_PARMETIS_LIBRARIES="/home/liuyangz/my_software/parmetis-4.0.3/build/Linux-x86_64/libparmetis/libparmetis.so;/home/liuyangz/my_software/parmetis-4.0.3/build/Linux-x86_64/libmetis/libmetis.so" \
-DTPL_ENABLE_COMBBLASLIB=OFF \
-DCMAKE_VERBOSE_MAKEFILE:BOOL=ON

make pddrive -j16
make pddrive3d -j16
#make f_pddrive

## -DTPL_BLAS_LIBRARIES=/global/cfs/cdirs/m3894/ptlin/tpl/amd_blis/install/amd_blis-20211021-n9-gcc9.3.0/lib/libblis.a \
(the diff for the twelfth changed file did not load)
