From 8fa7e0d45c3de0d41694f08a7657d4169caca90e Mon Sep 17 00:00:00 2001 From: Gamal Sallam Date: Thu, 12 Oct 2023 15:54:00 -0700 Subject: [PATCH] Enable Perf Trampoline Pre-fork Compile MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: [**Motivation**](https://docs.google.com/document/d/1XpBXcBz2VMGBIu9QrLO-6znTIrHWIn8HLI6zfLJ9ang/edit?usp=sharing) The perf-trampoline rolled out and enabled on a small number of boxes right now. The main blocker to enabling it globally is that it’s a ~1.5% gCPU regression. Most of the regression comes from us having to update the /tmp/perf-.map files. We can mitigate this cost by following the JIT model, and generating the trampolines during bootstrap. Here are the proposed steps for implementation: **During Bootstrap:** - Maintain a set of code objects (named `perf_trampoline_reg_units`), akin to `jit_reg_units`, but intended for items not included in the JIT list. - Create a new function (`compile_perf_trampoline_entries`) that iterates through `perf_trampoline_reg_units` and invokes _`PyPerfTrampoline_CompileCode` for each entry. - Call `compile_perf_trampoline_entries` during bootstrap. **Pre-fork Stage:** - Check if pre-fork perf-trampoline compilation is enabled. If so: - Copy the perfmap file from the parent's perf-map to the child's perf-map. - Refrain from re-initializing the perf-trampoline in the child process to prevent redundant recompilation. - Modify the JIT code to bypass copying perf-map files if the pre-fork perf-trampoline compilation is activated. **Other** - I added an env variable (`PERFTRAMPOLINEPREFORKCOMPILATION`) and a flag `perf_trampoline_prefork_compilation` to enable this feature at the process level. **We need to decide which part should be upstreamed.** Reviewed By: czardoz Differential Revision: D48612681 fbshipit-source-id: 9e1eb884328d453cfadaf0f2a0ac3f05af3996f4 --- CinderX/known-core-python-exported-symbols | 3 + Include/ceval.h | 3 + Include/sysmodule.h | 2 + Jit/config.h | 1 + Jit/perf_jitdump.cpp | 10 ++ Jit/pyjit.cpp | 39 +++++++ Jit/pyjit.h | 3 + Lib/test/test_perf_profiler_precompile.py | 120 +++++++++++++++++++++ Modules/cinder.c | 22 ++++ Objects/perf_trampoline.c | 48 ++++++++- Python/sysmodule.c | 35 ++++++ Tools/scripts/no_cinderx_skip_tests.txt | 1 + 12 files changed, 282 insertions(+), 5 deletions(-) create mode 100644 Lib/test/test_perf_profiler_precompile.py diff --git a/CinderX/known-core-python-exported-symbols b/CinderX/known-core-python-exported-symbols index 52f67d3ea43..0ca91a170d4 100644 --- a/CinderX/known-core-python-exported-symbols +++ b/CinderX/known-core-python-exported-symbols @@ -1963,9 +1963,12 @@ _PyUnicode_XStrip _PyUnion_Type _Py_union_type_or Py_UniversalNewlineFgets +PyUnstable_CopyPerfMapFile PyUnstable_GC_VisitObjects PyUnstable_PerfMapState_Fini PyUnstable_PerfMapState_Init +PyUnstable_PerfTrampoline_CompileCode +PyUnstable_PerfTrampoline_SetPersistAfterFork PyUnstable_Type_AssignVersionTag PyUnstable_WritePerfMapEntry _Py_UTF8_Edit_Cost diff --git a/Include/ceval.h b/Include/ceval.h index 0f687666e2b..79844580da2 100644 --- a/Include/ceval.h +++ b/Include/ceval.h @@ -129,6 +129,9 @@ Py_DEPRECATED(3.2) PyAPI_FUNC(void) PyEval_ReleaseLock(void); PyAPI_FUNC(void) PyEval_AcquireThread(PyThreadState *tstate); PyAPI_FUNC(void) PyEval_ReleaseThread(PyThreadState *tstate); +PyAPI_FUNC(int) PyUnstable_PerfTrampoline_CompileCode(PyCodeObject *); +PyAPI_FUNC(int) PyUnstable_PerfTrampoline_SetPersistAfterFork(int enable); + #define Py_BEGIN_ALLOW_THREADS { \ PyThreadState *_save; \ _save = PyEval_SaveThread(); diff --git a/Include/sysmodule.h b/Include/sysmodule.h index b6f29d1a1d0..77eb04f9c79 100644 --- a/Include/sysmodule.h +++ b/Include/sysmodule.h @@ -40,6 +40,8 @@ PyAPI_FUNC(int) PyUnstable_PerfMapState_Init(void); PyAPI_FUNC(int) PyUnstable_WritePerfMapEntry(const void *code_addr, unsigned int code_size, const char *entry_name); PyAPI_FUNC(void) PyUnstable_PerfMapState_Fini(void); +PyAPI_FUNC(int) PyUnstable_CopyPerfMapFile(const char* parent_filename); + #endif #ifndef Py_LIMITED_API diff --git a/Jit/config.h b/Jit/config.h index e9b50ac8ea1..65021d54a41 100644 --- a/Jit/config.h +++ b/Jit/config.h @@ -38,6 +38,7 @@ struct Config { uint32_t attr_cache_size{1}; uint32_t auto_jit_threshold{0}; uint32_t auto_jit_profile_threshold{0}; + bool compile_perf_trampoline_prefork{false}; }; // Get the JIT's current config object. diff --git a/Jit/perf_jitdump.cpp b/Jit/perf_jitdump.cpp index b74ec26e9c2..33803b9901a 100644 --- a/Jit/perf_jitdump.cpp +++ b/Jit/perf_jitdump.cpp @@ -398,6 +398,16 @@ void copyFileInfo(FileInfo& info) { info = {}; if (parent_filename.starts_with("/tmp/perf-") && + parent_filename.ends_with(".map") && + _PyPerfTrampoline_IsPreforkCompilationEnabled()) { + JIT_LOG( + "File %s has already been copied to %s by the perf trampoline, " + "skipping copy.", + parent_filename, + child_filename); + return; + } else if ( + parent_filename.starts_with("/tmp/perf-") && parent_filename.ends_with(".map") && isPerfTrampolineActive()) { if (!copyJitEntries(parent_filename)) { JIT_LOG( diff --git a/Jit/pyjit.cpp b/Jit/pyjit.cpp index 02b34703d11..95b8f281865 100644 --- a/Jit/pyjit.cpp +++ b/Jit/pyjit.cpp @@ -78,6 +78,10 @@ static JITList* g_jit_list{nullptr}; // Function and code objects ("units") registered for compilation. static std::unordered_set> jit_reg_units; +// Function and code objects ("units") registered for pre-fork perf-trampoline +// compilation. +static std::unordered_set> perf_trampoline_reg_units; + // Only set during preloading. Used to keep track of functions that were // deleted as a side effect of preloading. using UnitDeletedCallback = std::function; @@ -641,6 +645,12 @@ void initFlagProcessor() { xarg_flag_processor.addOption( "jit-help", "", jit_help, "print all available JIT flags and exits"); + + xarg_flag_processor.addOption( + "perf-trampoline-prefork-compilation", + "PERFTRAMPOLINEPREFORKCOMPILATION", + getMutableConfig().compile_perf_trampoline_prefork, + "Compile perf trampoline pre-fork"); } xarg_flag_processor.setFlags(PySys_GetXOptions()); @@ -688,6 +698,19 @@ static void compile_worker_thread() { JIT_DLOG("Finished compile worker in thread %d", std::this_thread::get_id()); } +static void compile_perf_trampoline_entries() { + for (const auto& unit : perf_trampoline_reg_units) { + if (PyFunction_Check(unit)) { + PyFunctionObject* func = (PyFunctionObject*)unit.get(); + if (PyUnstable_PerfTrampoline_CompileCode( + reinterpret_cast(func->func_code)) == -1) { + JIT_LOG("Failed to compile perf trampoline entry"); + } + } + } + perf_trampoline_reg_units.clear(); +} + static void multithread_compile_all() { JIT_CHECK(jit_ctx, "JIT not initialized"); @@ -1916,6 +1939,9 @@ int _PyJIT_RegisterFunction(PyFunctionObject* func) { } if (!_PyJIT_IsEnabled()) { + if (_PyPerfTrampoline_IsPreforkCompilationEnabled()) { + perf_trampoline_reg_units.emplace(reinterpret_cast(func)); + } return 0; } @@ -1926,6 +1952,8 @@ int _PyJIT_RegisterFunction(PyFunctionObject* func) { if (_PyJIT_OnJitList(func)) { jit_reg_units.emplace(reinterpret_cast(func)); result = 1; + } else if (_PyPerfTrampoline_IsPreforkCompilationEnabled()) { + perf_trampoline_reg_units.emplace(reinterpret_cast(func)); } // If we have an active jit-list, scan this function's code object for any @@ -2480,3 +2508,14 @@ void _PyJIT_SetProfileNewInterpThreads(int enabled) { int _PyJIT_GetProfileNewInterpThreads(void) { return profile_new_interp_threads; } + +int _PyPerfTrampoline_IsPreforkCompilationEnabled() { + return getConfig().compile_perf_trampoline_prefork; +} + +void _PyPerfTrampoline_CompilePerfTrampolinePreFork(void) { + if (_PyPerfTrampoline_IsPreforkCompilationEnabled()) { + PyUnstable_PerfTrampoline_SetPersistAfterFork(1); + compile_perf_trampoline_entries(); + } +} diff --git a/Jit/pyjit.h b/Jit/pyjit.h index d795a3514c7..af951407cd9 100644 --- a/Jit/pyjit.h +++ b/Jit/pyjit.h @@ -431,6 +431,9 @@ PyAPI_FUNC(int) _PyJIT_IsDisassemblySyntaxIntel(void); PyAPI_FUNC(void) _PyJIT_SetProfileNewInterpThreads(int); PyAPI_FUNC(int) _PyJIT_GetProfileNewInterpThreads(void); +PyAPI_FUNC(int) _PyPerfTrampoline_IsPreforkCompilationEnabled(void); +PyAPI_FUNC(void) _PyPerfTrampoline_CompilePerfTrampolinePreFork(void); + #ifdef __cplusplus } // extern "C" #endif diff --git a/Lib/test/test_perf_profiler_precompile.py b/Lib/test/test_perf_profiler_precompile.py new file mode 100644 index 00000000000..44921b62b6d --- /dev/null +++ b/Lib/test/test_perf_profiler_precompile.py @@ -0,0 +1,120 @@ +import pathlib +import subprocess +import sys +import sysconfig +import unittest + +from test.support.os_helper import temp_dir +from test.support.script_helper import assert_python_ok, make_script + +try: + from cinder import _is_compile_perf_trampoline_pre_fork_enabled +except: + raise unittest.SkipTest("pre-fork perf-trampoline compilation is not enabled") + + +def supports_trampoline_profiling(): + perf_trampoline = sysconfig.get_config_var("PY_HAVE_PERF_TRAMPOLINE") + if not perf_trampoline: + return False + return int(perf_trampoline) == 1 + + +if not supports_trampoline_profiling(): + raise unittest.SkipTest("perf trampoline profiling not supported") + +if not _is_compile_perf_trampoline_pre_fork_enabled(): + raise unittest.SkipTest("pre-fork perf-trampoline compilation is not enabled") + + +class TestPerfTrampolinePreCompile(unittest.TestCase): + def setUp(self): + super().setUp() + self.perf_files = set(pathlib.Path("/tmp/").glob("perf-*.map")) + + def tearDown(self) -> None: + super().tearDown() + files_to_delete = ( + set(pathlib.Path("/tmp/").glob("perf-*.map")) - self.perf_files + ) + for file in files_to_delete: + file.unlink() + + def test_trampoline_works(self): + code = """if 1: + import sys + import os + import sysconfig + from cinder import _compile_perf_trampoline_pre_fork + + def foo_fork(): + pass + def bar_fork(): + foo_fork() + def baz_fork(): + bar_fork() + + + def foo(): + pass + def bar(): + foo() + def baz(): + bar() + + if __name__ == "__main__": + _compile_perf_trampoline_pre_fork() + pid = os.fork() + if pid == 0: + print(os.getpid()) + baz_fork() + else: + baz() + """ + rc, out, err = assert_python_ok("-c", code) + with temp_dir() as script_dir: + script = make_script(script_dir, "perftest", code) + with subprocess.Popen( + [ + sys.executable, + "-X", + "perf-trampoline-prefork-compilation", + "-X", + "perf", + script, + ], + universal_newlines=True, + stderr=subprocess.PIPE, + stdout=subprocess.PIPE, + ) as process: + stdout, stderr = process.communicate() + self.assertNotIn("Error:", stderr) + child_pid = int(stdout.strip()) + perf_file = pathlib.Path(f"/tmp/perf-{process.pid}.map") + perf_child_file = pathlib.Path(f"/tmp/perf-{child_pid}.map") + self.assertTrue(perf_file.exists()) + self.assertTrue(perf_child_file.exists()) + + perf_file_contents = perf_file.read_text() + self.assertIn(f"py::foo:{script}", perf_file_contents) + self.assertIn(f"py::bar:{script}", perf_file_contents) + self.assertIn(f"py::baz:{script}", perf_file_contents) + self.assertIn(f"py::foo_fork:{script}", perf_file_contents) + self.assertIn(f"py::bar_fork:{script}", perf_file_contents) + self.assertIn(f"py::baz_fork:{script}", perf_file_contents) + + child_perf_file_contents = perf_child_file.read_text() + self.assertIn(f"py::foo_fork:{script}", child_perf_file_contents) + self.assertIn(f"py::bar_fork:{script}", child_perf_file_contents) + self.assertIn(f"py::baz_fork:{script}", child_perf_file_contents) + + # For pre-compiled entries, the entries of a forked process should + # appear exactly the same in both the parent and child processes. + perf_file_lines = perf_file_contents.split("\n") + for line in perf_file_lines: + if ( + f"py::foo_fork:{script}" in line + or f"py::bar_fork:{script}" in line + or f"py::baz_fork:{script}" in line + ): + self.assertIn(line, child_perf_file_contents) diff --git a/Modules/cinder.c b/Modules/cinder.c index 3453e8a4e13..f92ab274866 100644 --- a/Modules/cinder.c +++ b/Modules/cinder.c @@ -656,6 +656,20 @@ get_awaiter_frame(PyObject *self, PyObject *Py_UNUSED(args)) { } } +static PyObject* +compile_perf_trampoline_pre_fork(PyObject *self, PyObject *Py_UNUSED(args)) { + _PyPerfTrampoline_CompilePerfTrampolinePreFork(); + Py_RETURN_NONE; +} + +static PyObject* +is_compile_perf_trampoline_pre_fork_enabled(PyObject *self, PyObject *Py_UNUSED(args)) { + if(_PyPerfTrampoline_IsPreforkCompilationEnabled()) { + Py_RETURN_TRUE; + } + Py_RETURN_FALSE; +} + static struct PyMethodDef cinder_module_methods[] = { {"debug_break", cinder_debug_break, @@ -779,6 +793,14 @@ static struct PyMethodDef cinder_module_methods[] = { get_awaiter_frame, METH_NOARGS, "Get the awaiter frame of the current executing task"}, + {"_compile_perf_trampoline_pre_fork", + compile_perf_trampoline_pre_fork, + METH_NOARGS, + "Compile perf-trampoline entries before forking"}, + {"_is_compile_perf_trampoline_pre_fork_enabled", + is_compile_perf_trampoline_pre_fork_enabled, + METH_NOARGS, + "Return whether compile perf-trampoline entries before fork is enabled or not"}, {NULL, NULL} /* sentinel */ }; diff --git a/Objects/perf_trampoline.c b/Objects/perf_trampoline.c index cb6e64f3698..58e0f60d1c7 100644 --- a/Objects/perf_trampoline.c +++ b/Objects/perf_trampoline.c @@ -124,6 +124,7 @@ any DWARF information available for them). #include "pycore_ceval.h" #include "frameobject.h" #include "pycore_interp.h" +#include "Jit/pyjit.h" typedef enum { PERF_STATUS_FAILED = -1, // Perf trampoline is in an invalid state @@ -180,6 +181,7 @@ static perf_status_t perf_status = PERF_STATUS_NO_INIT; static Py_ssize_t extra_code_index = -1; static code_arena_t *code_arena; static trampoline_api_t trampoline_api; +static Py_ssize_t persist_after_fork = 0; static void perf_map_write_entry(void *state, const void *code_addr, @@ -301,6 +303,24 @@ compile_trampoline(void) return code_arena_new_code(code_arena); } +int PyUnstable_PerfTrampoline_CompileCode(PyCodeObject *co) +{ + py_trampoline f = NULL; + assert(extra_code_index != -1); + int ret = _PyCode_GetExtra((PyObject *)co, extra_code_index, (void **)&f); + if (ret != 0 || f == NULL) { + py_trampoline new_trampoline = compile_trampoline(); + if (new_trampoline == NULL) { + return 0; + } + trampoline_api.write_state(trampoline_api.state, new_trampoline, + code_arena->code_size, co); + return _PyCode_SetExtra((PyObject *)co, extra_code_index, + (void *)new_trampoline); + } + return 0; +} + static PyObject * py_trampoline_evaluator(PyThreadState *ts, PyFrameObject *frame, int throw) @@ -422,16 +442,34 @@ _PyPerfTrampoline_Fini(void) return 0; } +int +PyUnstable_PerfTrampoline_SetPersistAfterFork(int enable){ +#ifdef PY_HAVE_PERF_TRAMPOLINE + persist_after_fork = enable; + return persist_after_fork; +#endif + return 0; +} + PyStatus _PyPerfTrampoline_AfterFork_Child(void) { #ifdef PY_HAVE_PERF_TRAMPOLINE - // Restart trampoline in file in child. - int was_active = _PyIsPerfTrampolineActive(); - _PyPerfTrampoline_Fini(); PyUnstable_PerfMapState_Fini(); - if (was_active) { - _PyPerfTrampoline_Init(1); + if (persist_after_fork){ + char filename[256]; + pid_t parent_pid = getppid(); + snprintf(filename, sizeof(filename), "/tmp/perf-%d.map", parent_pid); + if(PyUnstable_CopyPerfMapFile(filename) != 0){ + return PyStatus_Error("Failed to copy perf map file."); + } + } else { + // Restart trampoline in file in child. + int was_active = _PyIsPerfTrampolineActive(); + _PyPerfTrampoline_Fini(); + if (was_active) { + _PyPerfTrampoline_Init(1); + } } #endif return PyStatus_Ok(); diff --git a/Python/sysmodule.c b/Python/sysmodule.c index ccd67203e70..09ddbd3fe9b 100644 --- a/Python/sysmodule.c +++ b/Python/sysmodule.c @@ -2175,6 +2175,41 @@ PyAPI_FUNC(void) PyUnstable_PerfMapState_Fini(void) { #endif } +PyAPI_FUNC(int) PyUnstable_CopyPerfMapFile(const char* parent_filename) { +#ifndef MS_WINDOWS + FILE* from = fopen(parent_filename, "r"); + if (!from) { + return -1; + } + if (perf_map_state.perf_map == NULL) { + int ret = PyUnstable_PerfMapState_Init(); + if(ret != 0){ + return ret; + } + } + char buf[4096]; + PyThread_acquire_lock(perf_map_state.map_lock, 1); + while (1) { + size_t bytes_read = fread(buf, 1, sizeof(buf), from); + size_t bytes_written = fwrite(buf, 1, bytes_read, perf_map_state.perf_map); + fflush(perf_map_state.perf_map); + + if (bytes_read < sizeof(buf) && feof(from)) { + fclose(from); + PyThread_release_lock(perf_map_state.map_lock); + return 0; + } + if (bytes_read == 0 || bytes_written < bytes_read) { + fclose(from); + PyThread_release_lock(perf_map_state.map_lock); + return -1; + } + } +#endif + return 0; +} + + #ifdef __cplusplus } #endif diff --git a/Tools/scripts/no_cinderx_skip_tests.txt b/Tools/scripts/no_cinderx_skip_tests.txt index 024a0379943..af8eaf85c93 100644 --- a/Tools/scripts/no_cinderx_skip_tests.txt +++ b/Tools/scripts/no_cinderx_skip_tests.txt @@ -12,5 +12,6 @@ test_compiler_sbs_stdlib_6 test_compiler_sbs_stdlib_7 test_compiler_sbs_stdlib_8 test_compiler_sbs_stdlib_9 +test_perf_profiler_precompile.py test_shadowcode test_strictmodule