Skip to content

Commit

Permalink
Enable Perf Trampoline Pre-fork Compile
Browse files Browse the repository at this point in the history
Summary:
[**Motivation**](https://docs.google.com/document/d/1XpBXcBz2VMGBIu9QrLO-6znTIrHWIn8HLI6zfLJ9ang/edit?usp=sharing)
The perf-trampoline rolled out and enabled on a small number of boxes right now. The main blocker to enabling it globally is that it’s a ~1.5% gCPU regression. Most of the regression comes from us having to update the /tmp/perf-<pid>.map files. We can mitigate this cost by following the JIT model, and generating the trampolines during bootstrap.

Here are the proposed steps for implementation:

**During Bootstrap:**

- Maintain a set of code objects (named `perf_trampoline_reg_units`), akin to `jit_reg_units`, but intended for items not included in the JIT list.
- Create a new function (`compile_perf_trampoline_entries`) that iterates through `perf_trampoline_reg_units` and invokes _`PyPerfTrampoline_CompileCode` for each entry.
- Call `compile_perf_trampoline_entries` during bootstrap.

**Pre-fork Stage:**

- Check if pre-fork perf-trampoline compilation is enabled. If so:
   - Copy the perfmap file from the parent's perf-map to the child's perf-map.
   - Refrain from re-initializing the perf-trampoline in the child process to prevent redundant recompilation.
    - Modify the JIT code to bypass copying perf-map files if the pre-fork perf-trampoline compilation is activated.

**Other**
- I added an env variable (`PERFTRAMPOLINEPREFORKCOMPILATION`) and a flag `perf_trampoline_prefork_compilation` to enable this feature at the process level.

**We need to decide which part should be upstreamed.**

Reviewed By: czardoz

Differential Revision: D48612681

fbshipit-source-id: 9e1eb884328d453cfadaf0f2a0ac3f05af3996f4
  • Loading branch information
Gamal Sallam authored and facebook-github-bot committed Oct 12, 2023
1 parent b0a8b5e commit 8fa7e0d
Show file tree
Hide file tree
Showing 12 changed files with 282 additions and 5 deletions.
3 changes: 3 additions & 0 deletions CinderX/known-core-python-exported-symbols
Original file line number Diff line number Diff line change
Expand Up @@ -1963,9 +1963,12 @@ _PyUnicode_XStrip
_PyUnion_Type
_Py_union_type_or
Py_UniversalNewlineFgets
PyUnstable_CopyPerfMapFile
PyUnstable_GC_VisitObjects
PyUnstable_PerfMapState_Fini
PyUnstable_PerfMapState_Init
PyUnstable_PerfTrampoline_CompileCode
PyUnstable_PerfTrampoline_SetPersistAfterFork
PyUnstable_Type_AssignVersionTag
PyUnstable_WritePerfMapEntry
_Py_UTF8_Edit_Cost
Expand Down
3 changes: 3 additions & 0 deletions Include/ceval.h
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,9 @@ Py_DEPRECATED(3.2) PyAPI_FUNC(void) PyEval_ReleaseLock(void);
PyAPI_FUNC(void) PyEval_AcquireThread(PyThreadState *tstate);
PyAPI_FUNC(void) PyEval_ReleaseThread(PyThreadState *tstate);

PyAPI_FUNC(int) PyUnstable_PerfTrampoline_CompileCode(PyCodeObject *);
PyAPI_FUNC(int) PyUnstable_PerfTrampoline_SetPersistAfterFork(int enable);

#define Py_BEGIN_ALLOW_THREADS { \
PyThreadState *_save; \
_save = PyEval_SaveThread();
Expand Down
2 changes: 2 additions & 0 deletions Include/sysmodule.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ PyAPI_FUNC(int) PyUnstable_PerfMapState_Init(void);
PyAPI_FUNC(int) PyUnstable_WritePerfMapEntry(const void *code_addr, unsigned int code_size, const char *entry_name);

PyAPI_FUNC(void) PyUnstable_PerfMapState_Fini(void);
PyAPI_FUNC(int) PyUnstable_CopyPerfMapFile(const char* parent_filename);

#endif

#ifndef Py_LIMITED_API
Expand Down
1 change: 1 addition & 0 deletions Jit/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ struct Config {
uint32_t attr_cache_size{1};
uint32_t auto_jit_threshold{0};
uint32_t auto_jit_profile_threshold{0};
bool compile_perf_trampoline_prefork{false};
};

// Get the JIT's current config object.
Expand Down
10 changes: 10 additions & 0 deletions Jit/perf_jitdump.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -398,6 +398,16 @@ void copyFileInfo(FileInfo& info) {
info = {};

if (parent_filename.starts_with("/tmp/perf-") &&
parent_filename.ends_with(".map") &&
_PyPerfTrampoline_IsPreforkCompilationEnabled()) {
JIT_LOG(
"File %s has already been copied to %s by the perf trampoline, "
"skipping copy.",
parent_filename,
child_filename);
return;
} else if (
parent_filename.starts_with("/tmp/perf-") &&
parent_filename.ends_with(".map") && isPerfTrampolineActive()) {
if (!copyJitEntries(parent_filename)) {
JIT_LOG(
Expand Down
39 changes: 39 additions & 0 deletions Jit/pyjit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,10 @@ static JITList* g_jit_list{nullptr};
// Function and code objects ("units") registered for compilation.
static std::unordered_set<BorrowedRef<>> jit_reg_units;

// Function and code objects ("units") registered for pre-fork perf-trampoline
// compilation.
static std::unordered_set<BorrowedRef<>> perf_trampoline_reg_units;

// Only set during preloading. Used to keep track of functions that were
// deleted as a side effect of preloading.
using UnitDeletedCallback = std::function<void(PyObject*)>;
Expand Down Expand Up @@ -641,6 +645,12 @@ void initFlagProcessor() {

xarg_flag_processor.addOption(
"jit-help", "", jit_help, "print all available JIT flags and exits");

xarg_flag_processor.addOption(
"perf-trampoline-prefork-compilation",
"PERFTRAMPOLINEPREFORKCOMPILATION",
getMutableConfig().compile_perf_trampoline_prefork,
"Compile perf trampoline pre-fork");
}

xarg_flag_processor.setFlags(PySys_GetXOptions());
Expand Down Expand Up @@ -688,6 +698,19 @@ static void compile_worker_thread() {
JIT_DLOG("Finished compile worker in thread %d", std::this_thread::get_id());
}

static void compile_perf_trampoline_entries() {
for (const auto& unit : perf_trampoline_reg_units) {
if (PyFunction_Check(unit)) {
PyFunctionObject* func = (PyFunctionObject*)unit.get();
if (PyUnstable_PerfTrampoline_CompileCode(
reinterpret_cast<PyCodeObject*>(func->func_code)) == -1) {
JIT_LOG("Failed to compile perf trampoline entry");
}
}
}
perf_trampoline_reg_units.clear();
}

static void multithread_compile_all() {
JIT_CHECK(jit_ctx, "JIT not initialized");

Expand Down Expand Up @@ -1916,6 +1939,9 @@ int _PyJIT_RegisterFunction(PyFunctionObject* func) {
}

if (!_PyJIT_IsEnabled()) {
if (_PyPerfTrampoline_IsPreforkCompilationEnabled()) {
perf_trampoline_reg_units.emplace(reinterpret_cast<PyObject*>(func));
}
return 0;
}

Expand All @@ -1926,6 +1952,8 @@ int _PyJIT_RegisterFunction(PyFunctionObject* func) {
if (_PyJIT_OnJitList(func)) {
jit_reg_units.emplace(reinterpret_cast<PyObject*>(func));
result = 1;
} else if (_PyPerfTrampoline_IsPreforkCompilationEnabled()) {
perf_trampoline_reg_units.emplace(reinterpret_cast<PyObject*>(func));
}

// If we have an active jit-list, scan this function's code object for any
Expand Down Expand Up @@ -2480,3 +2508,14 @@ void _PyJIT_SetProfileNewInterpThreads(int enabled) {
int _PyJIT_GetProfileNewInterpThreads(void) {
return profile_new_interp_threads;
}

int _PyPerfTrampoline_IsPreforkCompilationEnabled() {
return getConfig().compile_perf_trampoline_prefork;
}

void _PyPerfTrampoline_CompilePerfTrampolinePreFork(void) {
if (_PyPerfTrampoline_IsPreforkCompilationEnabled()) {
PyUnstable_PerfTrampoline_SetPersistAfterFork(1);
compile_perf_trampoline_entries();
}
}
3 changes: 3 additions & 0 deletions Jit/pyjit.h
Original file line number Diff line number Diff line change
Expand Up @@ -431,6 +431,9 @@ PyAPI_FUNC(int) _PyJIT_IsDisassemblySyntaxIntel(void);
PyAPI_FUNC(void) _PyJIT_SetProfileNewInterpThreads(int);
PyAPI_FUNC(int) _PyJIT_GetProfileNewInterpThreads(void);

PyAPI_FUNC(int) _PyPerfTrampoline_IsPreforkCompilationEnabled(void);
PyAPI_FUNC(void) _PyPerfTrampoline_CompilePerfTrampolinePreFork(void);

#ifdef __cplusplus
} // extern "C"
#endif
Expand Down
120 changes: 120 additions & 0 deletions Lib/test/test_perf_profiler_precompile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
import pathlib
import subprocess
import sys
import sysconfig
import unittest

from test.support.os_helper import temp_dir
from test.support.script_helper import assert_python_ok, make_script

try:
from cinder import _is_compile_perf_trampoline_pre_fork_enabled
except:
raise unittest.SkipTest("pre-fork perf-trampoline compilation is not enabled")


def supports_trampoline_profiling():
perf_trampoline = sysconfig.get_config_var("PY_HAVE_PERF_TRAMPOLINE")
if not perf_trampoline:
return False
return int(perf_trampoline) == 1


if not supports_trampoline_profiling():
raise unittest.SkipTest("perf trampoline profiling not supported")

if not _is_compile_perf_trampoline_pre_fork_enabled():
raise unittest.SkipTest("pre-fork perf-trampoline compilation is not enabled")


class TestPerfTrampolinePreCompile(unittest.TestCase):
def setUp(self):
super().setUp()
self.perf_files = set(pathlib.Path("/tmp/").glob("perf-*.map"))

def tearDown(self) -> None:
super().tearDown()
files_to_delete = (
set(pathlib.Path("/tmp/").glob("perf-*.map")) - self.perf_files
)
for file in files_to_delete:
file.unlink()

def test_trampoline_works(self):
code = """if 1:
import sys
import os
import sysconfig
from cinder import _compile_perf_trampoline_pre_fork
def foo_fork():
pass
def bar_fork():
foo_fork()
def baz_fork():
bar_fork()
def foo():
pass
def bar():
foo()
def baz():
bar()
if __name__ == "__main__":
_compile_perf_trampoline_pre_fork()
pid = os.fork()
if pid == 0:
print(os.getpid())
baz_fork()
else:
baz()
"""
rc, out, err = assert_python_ok("-c", code)
with temp_dir() as script_dir:
script = make_script(script_dir, "perftest", code)
with subprocess.Popen(
[
sys.executable,
"-X",
"perf-trampoline-prefork-compilation",
"-X",
"perf",
script,
],
universal_newlines=True,
stderr=subprocess.PIPE,
stdout=subprocess.PIPE,
) as process:
stdout, stderr = process.communicate()
self.assertNotIn("Error:", stderr)
child_pid = int(stdout.strip())
perf_file = pathlib.Path(f"/tmp/perf-{process.pid}.map")
perf_child_file = pathlib.Path(f"/tmp/perf-{child_pid}.map")
self.assertTrue(perf_file.exists())
self.assertTrue(perf_child_file.exists())

perf_file_contents = perf_file.read_text()
self.assertIn(f"py::foo:{script}", perf_file_contents)
self.assertIn(f"py::bar:{script}", perf_file_contents)
self.assertIn(f"py::baz:{script}", perf_file_contents)
self.assertIn(f"py::foo_fork:{script}", perf_file_contents)
self.assertIn(f"py::bar_fork:{script}", perf_file_contents)
self.assertIn(f"py::baz_fork:{script}", perf_file_contents)

child_perf_file_contents = perf_child_file.read_text()
self.assertIn(f"py::foo_fork:{script}", child_perf_file_contents)
self.assertIn(f"py::bar_fork:{script}", child_perf_file_contents)
self.assertIn(f"py::baz_fork:{script}", child_perf_file_contents)

# For pre-compiled entries, the entries of a forked process should
# appear exactly the same in both the parent and child processes.
perf_file_lines = perf_file_contents.split("\n")
for line in perf_file_lines:
if (
f"py::foo_fork:{script}" in line
or f"py::bar_fork:{script}" in line
or f"py::baz_fork:{script}" in line
):
self.assertIn(line, child_perf_file_contents)
22 changes: 22 additions & 0 deletions Modules/cinder.c
Original file line number Diff line number Diff line change
Expand Up @@ -656,6 +656,20 @@ get_awaiter_frame(PyObject *self, PyObject *Py_UNUSED(args)) {
}
}

static PyObject*
compile_perf_trampoline_pre_fork(PyObject *self, PyObject *Py_UNUSED(args)) {
_PyPerfTrampoline_CompilePerfTrampolinePreFork();
Py_RETURN_NONE;
}

static PyObject*
is_compile_perf_trampoline_pre_fork_enabled(PyObject *self, PyObject *Py_UNUSED(args)) {
if(_PyPerfTrampoline_IsPreforkCompilationEnabled()) {
Py_RETURN_TRUE;
}
Py_RETURN_FALSE;
}

static struct PyMethodDef cinder_module_methods[] = {
{"debug_break",
cinder_debug_break,
Expand Down Expand Up @@ -779,6 +793,14 @@ static struct PyMethodDef cinder_module_methods[] = {
get_awaiter_frame,
METH_NOARGS,
"Get the awaiter frame of the current executing task"},
{"_compile_perf_trampoline_pre_fork",
compile_perf_trampoline_pre_fork,
METH_NOARGS,
"Compile perf-trampoline entries before forking"},
{"_is_compile_perf_trampoline_pre_fork_enabled",
is_compile_perf_trampoline_pre_fork_enabled,
METH_NOARGS,
"Return whether compile perf-trampoline entries before fork is enabled or not"},
{NULL, NULL} /* sentinel */
};

Expand Down
48 changes: 43 additions & 5 deletions Objects/perf_trampoline.c
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ any DWARF information available for them).
#include "pycore_ceval.h"
#include "frameobject.h"
#include "pycore_interp.h"
#include "Jit/pyjit.h"

typedef enum {
PERF_STATUS_FAILED = -1, // Perf trampoline is in an invalid state
Expand Down Expand Up @@ -180,6 +181,7 @@ static perf_status_t perf_status = PERF_STATUS_NO_INIT;
static Py_ssize_t extra_code_index = -1;
static code_arena_t *code_arena;
static trampoline_api_t trampoline_api;
static Py_ssize_t persist_after_fork = 0;

static void
perf_map_write_entry(void *state, const void *code_addr,
Expand Down Expand Up @@ -301,6 +303,24 @@ compile_trampoline(void)
return code_arena_new_code(code_arena);
}

int PyUnstable_PerfTrampoline_CompileCode(PyCodeObject *co)
{
py_trampoline f = NULL;
assert(extra_code_index != -1);
int ret = _PyCode_GetExtra((PyObject *)co, extra_code_index, (void **)&f);
if (ret != 0 || f == NULL) {
py_trampoline new_trampoline = compile_trampoline();
if (new_trampoline == NULL) {
return 0;
}
trampoline_api.write_state(trampoline_api.state, new_trampoline,
code_arena->code_size, co);
return _PyCode_SetExtra((PyObject *)co, extra_code_index,
(void *)new_trampoline);
}
return 0;
}

static PyObject *
py_trampoline_evaluator(PyThreadState *ts, PyFrameObject *frame,
int throw)
Expand Down Expand Up @@ -422,16 +442,34 @@ _PyPerfTrampoline_Fini(void)
return 0;
}

int
PyUnstable_PerfTrampoline_SetPersistAfterFork(int enable){
#ifdef PY_HAVE_PERF_TRAMPOLINE
persist_after_fork = enable;
return persist_after_fork;
#endif
return 0;
}

PyStatus
_PyPerfTrampoline_AfterFork_Child(void)
{
#ifdef PY_HAVE_PERF_TRAMPOLINE
// Restart trampoline in file in child.
int was_active = _PyIsPerfTrampolineActive();
_PyPerfTrampoline_Fini();
PyUnstable_PerfMapState_Fini();
if (was_active) {
_PyPerfTrampoline_Init(1);
if (persist_after_fork){
char filename[256];
pid_t parent_pid = getppid();
snprintf(filename, sizeof(filename), "/tmp/perf-%d.map", parent_pid);
if(PyUnstable_CopyPerfMapFile(filename) != 0){
return PyStatus_Error("Failed to copy perf map file.");
}
} else {
// Restart trampoline in file in child.
int was_active = _PyIsPerfTrampolineActive();
_PyPerfTrampoline_Fini();
if (was_active) {
_PyPerfTrampoline_Init(1);
}
}
#endif
return PyStatus_Ok();
Expand Down
Loading

0 comments on commit 8fa7e0d

Please sign in to comment.