Skip to content

Commit

Permalink
Enable Papi high level stats within the iterate construct
Browse files Browse the repository at this point in the history
  • Loading branch information
vidsinghal committed Oct 9, 2023
1 parent 209a90d commit 04bea56
Show file tree
Hide file tree
Showing 6 changed files with 112 additions and 53 deletions.
10 changes: 9 additions & 1 deletion gibbon-compiler/src/Gibbon/Compiler.hs
Original file line number Diff line number Diff line change
Expand Up @@ -377,6 +377,7 @@ compileRTS Config{verbosity,optc,dynflags} = do
++ (if pointer then " POINTER=1 " else "")
++ (if parallel then " PARALLEL=1 " else "")
++ (if bumpAlloc then " BUMPALLOC=1 " else "")
++ (if papi then " PAPI=1 " else "")
++ (" USER_CFLAGS=\"" ++ optc ++ "\"")
++ (" VERBOSITY=" ++ show verbosity)
execCmd
Expand All @@ -392,6 +393,7 @@ compileRTS Config{verbosity,optc,dynflags} = do
rts_debug = gopt Opt_RtsDebug dynflags
print_gc_stats = gopt Opt_PrintGcStats dynflags
genGC = gopt Opt_GenGc dynflags
papi = gopt Opt_PapiInstrumentation dynflags


-- | Compile and run the generated code if appropriate
Expand Down Expand Up @@ -426,6 +428,10 @@ compileAndRunExe cfg@Config{backend,arrayInput,benchInput,mode,cfile,exefile} fp
links = if pointer
then " -lgc -lm "
else " -lm "
papi = gopt Opt_PapiInstrumentation (dynflags cfg)
links' = if papi
then links ++ "-l:libpapi.a "
else links
compile_program = do
compileRTS cfg
lib_dir <- getRTSBuildDir
Expand All @@ -436,7 +442,7 @@ compileAndRunExe cfg@Config{backend,arrayInput,benchInput,mode,cfile,exefile} fp
++" -L" ++ lib_dir
++ " -Wl,-rpath=" ++ lib_dir ++ " "
++ outfile ++ " " ++ rts_o_path
++ links ++ " -lgibbon_rts_ng"
++ links' ++ " -lgibbon_rts_ng"

execCmd
Nothing
Expand Down Expand Up @@ -524,6 +530,7 @@ compilationCmd C config = (cc config) ++" -std=gnu11 "
++ (if not genGC then " -D_GIBBON_GENGC=0 " else " -D_GIBBON_GENGC=1 ")
++ (if simpleWriteBarrier then " -D_GIBBON_SIMPLE_WRITE_BARRIER=1 " else " -D_GIBBON_SIMPLE_WRITE_BARRIER=0 ")
++ (if lazyPromote then " -D_GIBBON_EAGER_PROMOTION=0 " else " -D_GIBBON_EAGER_PROMOTION=1 ")
++ (if papi then " -D_GIBBON_ENABLE_PAPI " else "")
where dflags = dynflags config
bumpAlloc = gopt Opt_BumpAlloc dflags
pointer = gopt Opt_Pointer dflags
Expand All @@ -534,6 +541,7 @@ compilationCmd C config = (cc config) ++" -std=gnu11 "
genGC = gopt Opt_GenGc dflags
simpleWriteBarrier = gopt Opt_SimpleWriteBarrier dflags
lazyPromote = gopt Opt_NoEagerPromote dflags
papi = gopt Opt_PapiInstrumentation dflags

-- |
isBench :: Mode -> Bool
Expand Down
61 changes: 32 additions & 29 deletions gibbon-compiler/src/Gibbon/DynFlags.hs
Original file line number Diff line number Diff line change
Expand Up @@ -14,34 +14,36 @@ import Data.Set as S
import Options.Applicative

-- | General compiler flags. Each constructor is an individual switch; the
-- command-line parser collects the ones that were passed into a set, and
-- passes query them with @gopt@.
data GeneralFlag
= Opt_Gibbon1 -- ^ Set Opt_No_RemoveCopies & Opt_BigInfiniteRegions
| Opt_Gibbon2 -- ^ Set Opt_RemoveCopies & Opt_InfiniteRegions
| Opt_RemoveCopies -- ^ Calls to copy functions are converted to indirections
| Opt_No_RemoveCopies -- ^ Unset Opt_RemoveCopies
| Opt_InfiniteRegions -- ^ Use infinite regions
| Opt_BigInfiniteRegions -- ^ Use big infinite regions
| Opt_BenchPrint -- ^ Should the benchmarked function have its output printed?
| Opt_Packed -- ^ Use packed representation
| Opt_Pointer -- ^ Use pointer representation
| Opt_BumpAlloc -- ^ Use bump-pointer allocation if using the non-packed backend
| Opt_Warnc -- ^ Show warnings from the C compiler
| Opt_DisableGC -- ^ Don't run the garbage collector (used by Codegen).
| Opt_No_PureAnnot -- ^ Don't use 'pure' annotations (a GCC optimization)
| Opt_Fusion -- ^ Enable fusion.
| Opt_Parallel -- ^ Fork/join parallelism.
| Opt_RegionOnSpawn -- ^ Allocate into fresh regions for every spawn, not steal.
| Opt_GhcTc -- ^ Typecheck with GHC before compiling with Gibbon.
| Opt_RelativeOffsets -- ^ Enable relative offsets.
| Opt_CountParRegions -- ^ Count and print the number of regions allocated for parallelism.
| Opt_CountAllRegions -- ^ Count and print the number of all the regions allocated.
| Opt_RtsDebug -- ^ Compile the RTS in debugging mode.
| Opt_PrintGcStats -- ^ Record and print GC statistics.
| Opt_GenGc -- ^ Use the new non-generational GC.
| Opt_NoEagerPromote -- ^ Disable eager promotion.
| Opt_SimpleWriteBarrier -- ^ Disables eliminate-indirection-chains optimization.
| Opt_Layout_Local -- ^ Optimize the layout of Algebraic data types locally
| Opt_Layout_Global -- ^ Optimize the layout of Algebraic data types globally
| Opt_Layout_Use_Solver -- ^ Use the Solver to optimize the layout of the data types.
= Opt_Gibbon1 -- ^ Set Opt_No_RemoveCopies & Opt_BigInfiniteRegions
| Opt_Gibbon2 -- ^ Set Opt_RemoveCopies & Opt_InfiniteRegions
| Opt_RemoveCopies -- ^ Calls to copy functions are converted to indirections
| Opt_No_RemoveCopies -- ^ Unset Opt_RemoveCopies
| Opt_InfiniteRegions -- ^ Use infinite regions
| Opt_BigInfiniteRegions -- ^ Use big infinite regions
| Opt_BenchPrint -- ^ Should the benchmarked function have its output printed?
| Opt_Packed -- ^ Use packed representation
| Opt_Pointer -- ^ Use pointer representation
| Opt_BumpAlloc -- ^ Use bump-pointer allocation if using the non-packed backend
| Opt_Warnc -- ^ Show warnings from the C compiler
| Opt_DisableGC -- ^ Don't run the garbage collector (used by Codegen).
| Opt_No_PureAnnot -- ^ Don't use 'pure' annotations (a GCC optimization)
| Opt_Fusion -- ^ Enable fusion.
| Opt_Parallel -- ^ Fork/join parallelism.
| Opt_RegionOnSpawn -- ^ Allocate into fresh regions for every spawn, not steal.
| Opt_GhcTc -- ^ Typecheck with GHC before compiling with Gibbon.
| Opt_RelativeOffsets -- ^ Enable relative offsets.
| Opt_CountParRegions -- ^ Count and print the number of regions allocated for parallelism.
| Opt_CountAllRegions -- ^ Count and print the number of all the regions allocated.
| Opt_RtsDebug -- ^ Compile the RTS in debugging mode.
| Opt_PrintGcStats -- ^ Record and print GC statistics.
| Opt_GenGc -- ^ Use the new non-generational GC.
| Opt_NoEagerPromote -- ^ Disable eager promotion.
| Opt_SimpleWriteBarrier -- ^ Disables eliminate-indirection-chains optimization.
| Opt_Layout_Local -- ^ Optimize the layout of Algebraic data types locally
| Opt_Layout_Global -- ^ Optimize the layout of Algebraic data types globally
| Opt_Layout_Use_Solver -- ^ Use the Solver to optimize the layout of the data types.
| Opt_PapiInstrumentation -- ^ Enable PAPI instrumentation while compiling the gibbon binary.

deriving (Show,Read,Eq,Ord)

-- | Exactly like GHC's ddump flags.
Expand Down Expand Up @@ -120,7 +122,8 @@ dynflagsParser = DynFlags <$> (S.fromList <$> many gflagsParser) <*> (S.fromList
flag' Opt_SimpleWriteBarrier (long "simple-write-barrier" <> help "Disables eliminate-indirection-chains optimization.") <|>
flag' Opt_Layout_Local (long "opt-layout-local" <> help "Optimizes the Layout of Algebraic data types locally") <|>
flag' Opt_Layout_Global (long "opt-layout-global" <> help "Optimizes the Layout of Algebraic data types globally") <|>
flag' Opt_Layout_Use_Solver (long "opt-layout-use-solver" <> help "Use the solver instead of a Greedy Heuristic")
flag' Opt_Layout_Use_Solver (long "opt-layout-use-solver" <> help "Use the solver instead of a Greedy Heuristic") <|>
flag' Opt_PapiInstrumentation (long "enable-papi" <> help "Enable instrumentation using papi, extends the iterate timing function." )


dflagsParser :: Parser DebugFlag
Expand Down
42 changes: 35 additions & 7 deletions gibbon-compiler/src/Gibbon/Passes/Codegen.hs
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,9 @@ codegenProg cfg prg@(Prog info_tbl sym_tbl funs mtal) =
\#include <cilk/cilk.h>\n\
\#include <cilk/cilk_api.h>\n\
\#endif\n\n\
\#ifdef _GIBBON_ENABLE_PAPI\n\
\#include <papi.h>\n\
\#endif\n\n\
\/* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\
\ * Program starts here\n\
\ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\
Expand Down Expand Up @@ -629,6 +632,8 @@ codegenTail venv fenv sort_fns (LetTimedT flg bnds rhs body) ty sync_deps =
selftimed <- gensym "selftimed"
times <- gensym "times"
tmp <- gensym "tmp"
papi_retval <- gensym "papi_retval"
papi_region <- gensym "papi_region"
let ident = case bnds of
((v,_):_) -> v
_ -> (toVar "")
Expand Down Expand Up @@ -659,13 +664,36 @@ codegenTail venv fenv sort_fns (LetTimedT flg bnds rhs body) ty sync_deps =
, C.BlockStm [cstm| printf("itertime: %lf\n", $id:itertime); |]
, C.BlockStm [cstm| gib_vector_inplace_update($id:times, $id:iters, &($id:itertime)); |]
]
in [ C.BlockStm [cstm| for (long long $id:iters = 0; $id:iters < gib_get_iters_param(); $id:iters ++) { $items:body } |]
, C.BlockStm [cstm| gib_vector_inplace_sort($id:times, gib_compare_doubles); |]
, C.BlockDecl [cdecl| double *$id:tmp = (double*) gib_vector_nth($id:times, (gib_get_iters_param() / 2)); |]
, C.BlockDecl [cdecl| double $id:selftimed = *($id:tmp); |]
, C.BlockDecl [cdecl| double $id:batchtime = gib_sum_timing_array($id:times); |]
, C.BlockStm [cstm| gib_print_timing_array($id:times); |]
, C.BlockStm [cstm| gib_vector_free($id:times); |]
-- TODO: Find a better way to get a name for the region id.
ifdef = "#ifdef _GIBBON_ENABLE_PAPI"
endif = "#endif"
-- Loop body for one timed iteration, wrapped in a PAPI high-level region.
-- The PAPI statements are emitted between "#ifdef _GIBBON_ENABLE_PAPI" and
-- "#endif", so they only take effect when the C code is compiled with that
-- macro defined. The region name is the decimal value of the RTS region
-- counter, which is bumped after every iteration.
--
-- get_papi_region_id() returns a uint64_t, so it is cast to
-- unsigned long long and printed with "%llu"; printing it with "%d" is a
-- format/argument type mismatch (undefined behaviour in C).
body' = [ C.BlockStm [cstm| $escstm:ifdef |]
, C.BlockStm [cstm| sprintf($id:papi_region, "%llu", (unsigned long long) get_papi_region_id());|]
, C.BlockDecl [cdecl| int $id:papi_retval = PAPI_hl_region_begin($id:papi_region);|]
-- Abort the generated program if the region could not be opened.
, C.BlockStm [cstm| if ( $id:papi_retval != PAPI_OK ) {
exit(1);
} |]
, C.BlockStm [cstm| $escstm:endif |]
] ++
body ++
[ C.BlockStm [cstm| $escstm:ifdef |]
, C.BlockStm [cstm| $id:papi_retval = PAPI_hl_region_end($id:papi_region);|]
-- Abort the generated program if the region could not be closed.
, C.BlockStm [cstm| if ( $id:papi_retval != PAPI_OK ) {
exit(1);
} |]
-- Give the next iteration a fresh region name.
, C.BlockStm [cstm| increment_papi_region_id(); |]
, C.BlockStm [cstm| $escstm:endif |]
]
in [ C.BlockStm [cstm| $escstm:ifdef |]
, C.BlockDecl [cdecl| char $id:papi_region[128];|]
, C.BlockStm [cstm| $escstm:endif |]
, C.BlockStm [cstm| for (long long $id:iters = 0; $id:iters < gib_get_iters_param(); $id:iters ++) { $items:body' } |]
, C.BlockStm [cstm| gib_vector_inplace_sort($id:times, gib_compare_doubles); |]
, C.BlockDecl [cdecl| double *$id:tmp = (double*) gib_vector_nth($id:times, (gib_get_iters_param() / 2)); |]
, C.BlockDecl [cdecl| double $id:selftimed = *($id:tmp); |]
, C.BlockDecl [cdecl| double $id:batchtime = gib_sum_timing_array($id:times); |]
, C.BlockStm [cstm| gib_print_timing_array($id:times); |]
, C.BlockStm [cstm| gib_vector_free($id:times); |]
])

-- else
Expand Down
8 changes: 6 additions & 2 deletions gibbon-rts/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# POINTER
# PARALLEL
# BUMPALLOC
# PAPI
#
#
# GC toggles:
Expand Down Expand Up @@ -69,6 +70,10 @@ ifeq ($(POINTER), 1)
CFLAGS += -D_GIBBON_POINTER
endif

# When make is invoked with PAPI=1, define _GIBBON_ENABLE_PAPI for the C
# sources built with CFLAGS.
ifeq ($(PAPI), 1)
CFLAGS += -D_GIBBON_ENABLE_PAPI
endif

ifeq ($(PARALLEL), 1)
CFLAGS += -fcilkplus -D_GIBBON_PARALLEL
endif
Expand Down Expand Up @@ -111,7 +116,6 @@ RUST_RTS_SO := libgibbon_rts_ng.so
RUST_RTS_PATH := $(RUST_RTS_DIR)/target/$(MODE)/$(RUST_RTS_SO)
RUST_SOURCES := $(shell find $(RUST_RTS_DIR) -type f -name *.rs)


all: rts

rts: c_rts rs_rts
Expand Down Expand Up @@ -146,7 +150,7 @@ $(C_RTS_DIR)/%.o: $(C_RTS_DIR)/%.c

$(BUILD_DIR)/%.h: $(C_RTS_DIR)/%.h
mkdir -p $(BUILD_DIR) && \
ln -s $^ $@
ln -s -f $^ $@

$(BUILD_DIR):
mkdir -p $(BUILD_DIR)
Expand Down
17 changes: 15 additions & 2 deletions gibbon-rts/rts-c/gibbon_rts.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,9 @@
#include <cilk/cilk_api.h>
#endif



#ifdef _GIBBON_ENABLE_PAPI
#include <papi.h>
#endif

/* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* Globals and their accessors
Expand All @@ -61,6 +62,8 @@ static int64_t gib_global_region_count = 0;
// Invariant: should always be equal to max(sym_table_keys).
static GibSym gib_global_gensym_counter = 0;

//PAPI: specify the region to instrument
static uint64_t papi_region_id = 0;


size_t gib_get_biginf_init_chunk_size(void)
Expand Down Expand Up @@ -128,6 +131,16 @@ GibSym gib_read_gensym_counter(void)
return gib_global_gensym_counter;
}

/* Accessor for the PAPI region counter: returns the current value of
 * papi_region_id without modifying it. */
uint64_t get_papi_region_id(void)
{
    const uint64_t current_id = papi_region_id;
    return current_id;
}

/* Advance the PAPI region counter (papi_region_id) by one. */
void increment_papi_region_id(void)
{
    papi_region_id += 1;
}


/* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* Allocators
Expand Down
27 changes: 15 additions & 12 deletions gibbon-rts/rts-c/gibbon_rts.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,18 +14,19 @@
* CPP macros used in the RTS:
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*
* _GIBBON_VERBOSITY=int verbosity level for debug output
* _GIBBON_DEBUG enables various assertions if present
* _GIBBON_GCSTATS collect GC statistics if present
* _GIBBON_PRINT_GCSTATS print GC statistics if present
* _GIBBON_GENGC use only the old reference-counted GC if set to 0
* _GIBBON_BOUNDSCHECK boundscheck vector accesses
* _GIBBON_BUMPALLOC_LISTS bump allocated linked lists
* _GIBBON_BUMPALLOC_HEAP bump allocated gib_alloc
* _GIBBON_POINTER pointer mode gib_alloc
* _GIBBON_PARALLEL parallel mode
* _GIBBON_EAGER_PROMOTION disable eager promotion if set to 0
* _GIBBON_SIMPLE_WRITE_BARRIER disable eliminate-indirection-chains optimization
* _GIBBON_VERBOSITY=int verbosity level for debug output
* _GIBBON_DEBUG enables various assertions if present
* _GIBBON_GCSTATS collect GC statistics if present
* _GIBBON_PRINT_GCSTATS print GC statistics if present
* _GIBBON_GENGC use only the old reference-counted GC if set to 0
* _GIBBON_BOUNDSCHECK boundscheck vector accesses
* _GIBBON_BUMPALLOC_LISTS bump allocated linked lists
* _GIBBON_BUMPALLOC_HEAP bump allocated gib_alloc
* _GIBBON_POINTER pointer mode gib_alloc
* _GIBBON_PARALLEL parallel mode
* _GIBBON_EAGER_PROMOTION disable eager promotion if set to 0
* _GIBBON_SIMPLE_WRITE_BARRIER disable eliminate-indirection-chains optimization
* _GIBBON_ENABLE_PAPI enable instrumentation via papi
*
*/

Expand Down Expand Up @@ -116,6 +117,8 @@ char *gib_read_bench_prog_param(void);
char *gib_read_benchfile_param(void);
char *gib_read_arrayfile_param(void);
uint64_t gib_read_arrayfile_length_param(void);
uint64_t get_papi_region_id(void);
void increment_papi_region_id(void);

// Number of regions allocated.
int64_t gib_read_region_count(void);
Expand Down

0 comments on commit 04bea56

Please sign in to comment.