Skip to content

Commit

Permalink
Support task tracer
Browse files Browse the repository at this point in the history
  • Loading branch information
chenBright committed Dec 16, 2024
1 parent 282bc90 commit c44cb27
Show file tree
Hide file tree
Showing 29 changed files with 1,209 additions and 141 deletions.
13 changes: 13 additions & 0 deletions .github/actions/init-ut-make-config/action.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
inputs:
options:
description: extra options for config_brpc.sh
required: false
runs:
using: "composite"
steps:
- run: sudo git clone https://github.com/libunwind/libunwind.git && cd libunwind && sudo git checkout tags/v1.8.1 && sudo mkdir -p /libunwind && sudo autoreconf -i && sudo CC=clang CXX=clang++ ./configure --prefix=/libunwind && sudo make -j ${{env.proc_num}} && sudo make install
shell: bash
- run: sudo apt-get update && sudo apt-get install -y libgtest-dev cmake gdb libstdc++6-9-dbg && cd /usr/src/gtest && sudo cmake . && sudo make -j ${{env.proc_num}} && sudo mv lib/libgtest* /usr/lib/
shell: bash
- run: sh config_brpc.sh --headers="/libunwind/include /usr/include" --libs="/libunwind/lib /usr/lib /usr/lib64" --nodebugsymbols ${{inputs.options}}
shell: bash
2 changes: 1 addition & 1 deletion .github/actions/install-all-dependences/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ runs:
using: "composite"
steps:
- uses: ./.github/actions/install-essential-dependences
- run: sudo apt-get install -y libgoogle-glog-dev automake bison flex libboost-all-dev libevent-dev libtool pkg-config libibverbs1 libibverbs-dev
- run: sudo apt-get install -y libgoogle-glog-dev automake bison flex libboost-all-dev libevent-dev libtool pkg-config libibverbs1 libibverbs-dev libunwind8-dev
shell: bash
- run: wget https://archive.apache.org/dist/thrift/0.11.0/thrift-0.11.0.tar.gz && tar -xf thrift-0.11.0.tar.gz
shell: bash
Expand Down
46 changes: 21 additions & 25 deletions .github/workflows/ci-linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ jobs:
steps:
- uses: actions/checkout@v2
- uses: ./.github/actions/install-essential-dependences
- name: cmake
- name: cmake
run: |
export CC=gcc && export CXX=g++
mkdir build
Expand All @@ -47,7 +47,7 @@ jobs:
steps:
- uses: actions/checkout@v2
- run: bazel test --verbose_failures -- //... -//example/...

gcc-compile-with-boringssl:
runs-on: ubuntu-20.04
steps:
Expand All @@ -61,7 +61,7 @@ jobs:
- uses: ./.github/actions/install-all-dependences
- uses: ./.github/actions/init-make-config
with:
options: --cc=gcc --cxx=g++ --with-thrift --with-glog --with-rdma --with-debug-bthread-sche-safety --with-debug-lock
options: --cc=gcc --cxx=g++ --with-thrift --with-glog --with-rdma --with-debug-bthread-sche-safety --with-debug-lock --with-bthread-tracer
- name: compile
run: |
make -j ${{env.proc_num}}
Expand All @@ -76,7 +76,7 @@ jobs:
export CC=gcc && export CXX=g++
mkdir build
cd build
cmake -DWITH_MESALINK=OFF -DWITH_GLOG=ON -DWITH_THRIFT=ON -DWITH_RDMA=ON -DWITH_DEBUG_BTHREAD_SCHE_SAFETY=ON -DWITH_DEBUG_LOCK=ON ..
cmake -DWITH_MESALINK=OFF -DWITH_GLOG=ON -DWITH_THRIFT=ON -DWITH_RDMA=ON -DWITH_DEBUG_BTHREAD_SCHE_SAFETY=ON -DWITH_DEBUG_LOCK=ON -WITH_BTHREAD_TRACER=ON ..
- name: compile
run: |
cd build
Expand Down Expand Up @@ -105,7 +105,7 @@ jobs:
steps:
- uses: actions/checkout@v2
- uses: ./.github/actions/install-essential-dependences
- name: cmake
- name: cmake
run: |
export CC=clang && export CXX=clang++
mkdir build
Expand Down Expand Up @@ -135,7 +135,7 @@ jobs:
- uses: ./.github/actions/install-all-dependences
- uses: ./.github/actions/init-make-config
with:
options: --cc=clang --cxx=clang++ --with-thrift --with-glog --with-rdma --with-debug-bthread-sche-safety --with-debug-lock
options: --cc=clang --cxx=clang++ --with-thrift --with-glog --with-rdma --with-debug-bthread-sche-safety --with-debug-lock --with-bthread-tracer
- name: compile
run: |
make -j ${{env.proc_num}}
Expand All @@ -150,7 +150,7 @@ jobs:
export CC=clang && export CXX=clang++
mkdir build
cd build
cmake -DWITH_MESALINK=OFF -DWITH_GLOG=ON -DWITH_THRIFT=ON -DWITH_RDMA=ON -DWITH_DEBUG_BTHREAD_SCHE_SAFETY=ON -DWITH_DEBUG_LOCK=ON ..
cmake -DWITH_MESALINK=OFF -DWITH_GLOG=ON -DWITH_THRIFT=ON -DWITH_RDMA=ON -DWITH_DEBUG_BTHREAD_SCHE_SAFETY=ON -DWITH_DEBUG_LOCK=ON -WITH_BTHREAD_TRACER=ON ..
- name: compile
run: |
cd build
Expand All @@ -165,21 +165,17 @@ jobs:
clang-unittest:
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v2
- uses: ./.github/actions/install-essential-dependences
- name: install gtest
run: |
sudo apt-get update
sudo apt-get install -y cmake libgtest-dev gdb
cd /usr/src/gtest && sudo cmake . && sudo make && sudo mv lib/libgtest* /usr/lib/
- uses: ./.github/actions/init-make-config
with:
options: --cc=clang --cxx=clang++
- name: compile tests
run: |
cd test
make -j ${{env.proc_num}}
- name: run tests
run: |
cd test
sh ./run_tests.sh
- uses: actions/checkout@v2
- uses: ./.github/actions/install-essential-dependences
- uses: ./.github/actions/init-ut-make-config
with:
options: --cc=clang --cxx=clang++ --with-bthread-tracer
- name: compile tests
run: |
cat config.mk
cd test
make -j ${{env.proc_num}}
- name: run tests
run: |
cd test
sh ./run_tests.sh
20 changes: 20 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ option(WITH_BORINGSSL "With BoringSSL" OFF)
option(DEBUG "Print debug logs" OFF)
option(WITH_DEBUG_SYMBOLS "With debug symbols" ON)
option(WITH_THRIFT "With thrift framed protocol supported" OFF)
option(WITH_BTHREAD_TRACER "With bthread tracer supported" OFF)
option(WITH_SNAPPY "With snappy" OFF)
option(WITH_RDMA "With RDMA" OFF)
option(WITH_DEBUG_BTHREAD_SCHE_SAFETY "With debugging bthread sche safety" OFF)
Expand Down Expand Up @@ -81,6 +82,20 @@ if(WITH_THRIFT)
endif()
endif()

if (WITH_BTHREAD_TRACER)
if (NOT (CMAKE_SYSTEM_NAME STREQUAL "Linux") OR NOT (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64"))
message(FATAL_ERROR "bthread tracer is only supported on Linux x86_64 platform")
endif()
find_path(LIBUNWIND_INCLUDE_PATH NAMES libunwind.h)
find_library(LIBUNWIND_LIB NAMES unwind)
find_library(LIBUNWIND_X86_64_LIB NAMES unwind-x86_64)
if (NOT LIBUNWIND_INCLUDE_PATH OR NOT LIBUNWIND_LIB)
message(FATAL_ERROR "Fail to find libunwind, which is needed by bthread tracer")
endif()
add_definitions(-DBRPC_BTHREAD_TRACER)
include_directories(${LIBUNWIND_INCLUDE_PATH})
endif ()

set(WITH_RDMA_VAL "0")
if(WITH_RDMA)
set(WITH_RDMA_VAL "1")
Expand Down Expand Up @@ -319,6 +334,11 @@ if(WITH_SNAPPY)
set(BRPC_PRIVATE_LIBS "${BRPC_PRIVATE_LIBS} -lsnappy")
endif()

if (WITH_BTHREAD_TRACER)
set(DYNAMIC_LIB ${DYNAMIC_LIB} ${LIBUNWIND_LIB} ${LIBUNWIND_X86_64_LIB})
set(BRPC_PRIVATE_LIBS "${BRPC_PRIVATE_LIBS} -lunwind -lunwind-x86_64")
endif()

if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
set(DYNAMIC_LIB ${DYNAMIC_LIB} rt)
set(BRPC_PRIVATE_LIBS "${BRPC_PRIVATE_LIBS} -lrt")
Expand Down
29 changes: 25 additions & 4 deletions config_brpc.sh
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,12 @@ else
LDD=ldd
fi

TEMP=`getopt -o v: --long headers:,libs:,cc:,cxx:,with-glog,with-thrift,with-rdma,with-mesalink,with-debug-bthread-sche-safety,with-debug-lock,nodebugsymbols -n 'config_brpc' -- "$@"`
TEMP=`getopt -o v: --long headers:,libs:,cc:,cxx:,with-glog,with-thrift,with-rdma,with-mesalink,with-bthread-tracer,with-debug-bthread-sche-safety,with-debug-lock,nodebugsymbols -n 'config_brpc' -- "$@"`
WITH_GLOG=0
WITH_THRIFT=0
WITH_RDMA=0
WITH_MESALINK=0
WITH_BTHREAD_TRACER=0
BRPC_DEBUG_BTHREAD_SCHE_SAFETY=0
DEBUGSYMBOLS=-g
BRPC_DEBUG_LOCK=0
Expand All @@ -69,6 +70,7 @@ while true; do
--with-thrift) WITH_THRIFT=1; shift 1 ;;
--with-rdma) WITH_RDMA=1; shift 1 ;;
--with-mesalink) WITH_MESALINK=1; shift 1 ;;
--with-bthread-tracer) WITH_BTHREAD_TRACER=1; shift 1 ;;
--with-debug-bthread-sche-safety ) BRPC_DEBUG_BTHREAD_SCHE_SAFETY=1; shift 1 ;;
--with-debug-lock ) BRPC_DEBUG_LOCK=1; shift 1 ;;
--nodebugsymbols ) DEBUGSYMBOLS=; shift 1 ;;
Expand Down Expand Up @@ -352,8 +354,27 @@ fi

LEVELDB_HDR=$(find_dir_of_header_or_die leveldb/db.h)

HDRS=$($ECHO "$GFLAGS_HDR\n$PROTOBUF_HDR\n$ABSL_HDR\n$LEVELDB_HDR\n$OPENSSL_HDR" | sort | uniq)
LIBS=$($ECHO "$GFLAGS_LIB\n$PROTOBUF_LIB\n$ABSL_LIB\n$LEVELDB_LIB\n$OPENSSL_LIB\n$SNAPPY_LIB" | sort | uniq)
CPPFLAGS=

if [ $WITH_BTHREAD_TRACER != 0 ]; then
if [ "$SYSTEM" != "Linux" ] || [ "$(uname -m)" != "x86_64" ]; then
>&2 $ECHO "bthread tracer is only supported on Linux x86_64 platform"
exit 1
fi
LIBUNWIND_HDR=$(find_dir_of_header_or_die libunwind.h)
LIBUNWIND_LIB=$(find_dir_of_lib_or_die unwind)

CPPFLAGS="${CPPFLAGS} -DBRPC_BTHREAD_TRACER"

if [ -f "$LIBUNWIND_LIB/libunwind.$SO" ]; then
DYNAMIC_LINKINGS="$DYNAMIC_LINKINGS -lunwind -lunwind-x86_64"
else
STATIC_LINKINGS="$STATIC_LINKINGS -lunwind -lunwind-x86_64"
fi
fi

HDRS=$($ECHO "$LIBUNWIND_HDR\n$GFLAGS_HDR\n$PROTOBUF_HDR\n$ABSL_HDR\n$LEVELDB_HDR\n$OPENSSL_HDR" | sort | uniq)
LIBS=$($ECHO "$LIBUNWIND_LIB\n$GFLAGS_LIB\n$PROTOBUF_LIB\n$ABSL_LIB\n$LEVELDB_LIB\n$OPENSSL_LIB\n$SNAPPY_LIB" | sort | uniq)

absent_in_the_list() {
TMP=`$ECHO "$1\n$2" | sort | uniq`
Expand Down Expand Up @@ -411,7 +432,7 @@ append_to_output "STATIC_LINKINGS=$STATIC_LINKINGS"
append_to_output "DYNAMIC_LINKINGS=$DYNAMIC_LINKINGS"

# CPP means C PreProcessing, not C PlusPlus
CPPFLAGS="-DBRPC_WITH_GLOG=$WITH_GLOG -DGFLAGS_NS=$GFLAGS_NS -DBRPC_DEBUG_BTHREAD_SCHE_SAFETY=$BRPC_DEBUG_BTHREAD_SCHE_SAFETY -DBRPC_DEBUG_LOCK=$BRPC_DEBUG_LOCK"
CPPFLAGS="${CPPFLAGS} -DBRPC_WITH_GLOG=$WITH_GLOG -DGFLAGS_NS=$GFLAGS_NS -DBRPC_DEBUG_BTHREAD_SCHE_SAFETY=$BRPC_DEBUG_BTHREAD_SCHE_SAFETY -DBRPC_DEBUG_LOCK=$BRPC_DEBUG_LOCK"

# Avoid over-optimizations of TLS variables by GCC>=4.8
# See: https://github.com/apache/brpc/issues/1693
Expand Down
69 changes: 69 additions & 0 deletions docs/cn/bthread_tracer.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
gdb(ptrace)+ gdb_bthread_stack.py主要的缺点是要慢和阻塞进程,需要一种高效的追踪bthread调用栈的方法。

bRPC框架的协作式用户态协程无法像Golang内建的抢占式协程一样实现高效的STW(Stop the World),框架也无法干预用户逻辑的执行,所以要追踪bthread调用栈是比较困难的。

在线追踪bthread调用栈需要解决以下问题:
1. 追踪挂起bthread的调用栈。
2. 追踪运行中bthread的调用栈。

# bthread状态模型

以下是目前的bthread状态模型。

![bthread状态模型](../images/bthread_status_model.svg)

# 设计方案

## 核心思路

为了解决上述两个问题,该方案实现了STB(Stop The Bthread),核心思路可以简单总结为,在追踪bthread调用栈的过程中,状态不能流转到当前追踪方法不支持的状态。STB包含了两种追踪模式:上下文(context)追踪模式和信号追踪模式。

### 上下文(context)追踪模式
上下文追踪模式可以追踪挂起bthread的调用栈。挂起的bthread栈是稳定的,利用TaskMeta.stack中保存的上下文信息(x86_64下关键的寄存器主要是RIP、RSP、RBP),通过一些可以回溯指定上下文调用栈的库来追踪bthread调用栈。但是挂起的bthread随时可能会被唤醒,执行逻辑(包括jump_stack),则bthread栈会一直变化。不稳定的上下文是不能用来追踪调用栈的,需要在jump_stack前拦截bthread的调度,等到调用栈追踪完成后才继续运行bthread。所以,上下文追踪模式支持就绪、挂起这两个状态。

### 信号追踪模式

信号追踪模式可以追踪运行中bthread的调用栈。运行中bthread是不稳定的,不能使用TaskMeta.stack来追踪bthread调用栈。只能另辟蹊径,使用信号中断bthread运行逻辑,在信号处理函数中回溯bthread调用栈。使用信号有两个问题:

1. 异步信号安全问题。
2. 信号追踪模式不支持jump_stack。调用栈回溯需要寄存器信息,但jump_stack会操作寄存器,这个过程是不安全的,所以jump_stack不能被信号中断,需要在jump_stack前拦截bthread的调度,等到bthread调用栈追踪完成后才继续挂起bthread。

所以,追踪模式只支持运行状态。

### 小结

jump_stack是bthread挂起或者运行的必经之路,也是STB的拦截点。STB将状态分成三类:
1. 上下文追踪模式的状态:就绪、挂起。
2. 支持信号追踪模式的状态:运行。
3. 不支持追踪的状态。jump_stack的过程是不允许使用以上两种调用栈追踪方法,需要在jump_stack前拦截bthread的调度,等到调用栈追踪完成后才继续调度bthread。

### 详细流程

以下是引入STB后的bthread状态模型,在原来bthread状态模型的基础上,加入两个状态(拦截点):将运行、挂起中。

![bthread STB状态模型](../images/bthread_stb_model.svg)

经过上述分析,总结出STB的流程:

1. TaskTracer(实现STB的一个模块)收到追踪bthread调用栈的请求时,标识正在追踪。追踪完成后,标识追踪完成,并TaskTracer发信号通知可能处于将运行或者挂起中状态的bthread。根据bthread状态,TaskTracer执行不同的逻辑:
- 创建、就绪但还没分配栈、销毁:直接结束追踪。
- 挂起、就绪:使用上下文追踪模式追踪bthread的调用栈。
- 运行:使用信号追踪模式追踪bthread的调用栈。
- 将运行、挂起中:TaskTracer自旋等到bthread状态流转到下一个状态(挂起或者运行)后继续追踪。

2. TaskTracer追踪时,bthread根据状态也会执行不同的逻辑:
- 创建、就绪但还没分配栈、就绪:不需要额外处理。
- 挂起、运行:通知TaskTracer继续追踪。
- 将运行、挂起中、销毁:bthread通过条件变量等到TaskTracer追踪完成。TaskTracer追踪完成后会通过条件变量通知bthread继续执行jump_stack。

# 使用方法

1. 下载安装libunwind。
2. 给config_brpc.sh增加`--with-bthread-tracer`选项或者给cmake增加`-DWITH_BTHREAD_TRACER=ON`选项。
3. 访问服务的内置服务:`http://ip:port/bthreads/<bthread_id>?st=1`或者代码里调用`bthread::stack_trace()`函数。
4. 如果希望追踪pthread的调用栈,在对应pthread上调用`bthread::init_for_pthread_stack_trace()`函数获取一个伪bthread_t,然后使用步骤3即可获取pthread调用栈。

# 相关flag

- `enable_fast_unwind`:是否启用快速回溯功能,默认为true。大多数情况下,不需要关闭快速回溯功能。除非你关注的调用栈函数名转换失败,显示为`<unknown>`,则可以尝试关闭快速回溯功能,但这会导致性能下降。以包含30帧的调用栈举例,快速回溯只需要400~500us,而关闭快速回溯则需要4ms左右,性能下降了近10倍。
- `signal_trace_timeout_ms`:信号追踪模式的超时时间,默认为50ms。虽然libunwind文档显示回溯功能是异步信号安全的,但是[gpertools社区发现libunwind在某些情况下会死锁](https://github.com/gperftools/gperftools/issues/775),所以TaskTracer会设置了超时时间,超时后会放弃回溯,打破死锁。
8 changes: 7 additions & 1 deletion docs/cn/getting_started.md
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,7 @@ brpc默认**不**链接 [tcmalloc](http://goog-perftools.sourceforge.net/doc/tcm

## glog: 3.3+

brpc实现了一个默认的[日志功能](../../src/butil/logging.h)它和glog冲突。要替换成glog,可以给config_brpc.sh增加*--with-glog*选项或者给cmake增加`-DWITH_GLOG=ON`选项。
brpc实现了一个默认的[日志功能](../../src/butil/logging.h)它和glog冲突。要替换成glog,可以给config_brpc.sh增加`--with-glog`选项或者给cmake增加`-DWITH_GLOG=ON`选项。

## valgrind: 3.8+

Expand All @@ -392,6 +392,12 @@ brpc会自动检测valgrind(然后注册bthread的栈)。不支持老版本

无已知问题。

## libunwind: 1.3-1.8.1

bRPC默认****链接 [libunwind](https://github.com/libunwind/libunwind)。用户需要追踪bthread功能则链接libunwind,可以给config_brpc.sh增加`--with-bthread-tracer`选项或者给cmake增加`-DWITH_BTHREAD_TRACER=ON`选项。

建议使用最新版本的libunwind。

# 实例追踪

我们提供了一个程序去帮助你追踪和监控所有brpc实例。 只需要在某处运行 [trackme_server](https://github.com/apache/brpc/tree/master/tools/trackme_server/) 然后再带着 -trackme_server=SERVER参数启动需要被追踪的实例。trackme_server将从实例周期性地收到ping消息然后打印日志。您可以从日志中聚合实例地址,并调用实例的内置服务以获取更多信息。
8 changes: 7 additions & 1 deletion docs/en/getting_started.md
Original file line number Diff line number Diff line change
Expand Up @@ -377,14 +377,20 @@ When you remove tcmalloc, not only remove the linkage with tcmalloc but also the

## glog: 3.3+

brpc implements a default [logging utility](../../src/butil/logging.h) which conflicts with glog. To replace this with glog, add *--with-glog* to config_brpc.sh or add `-DWITH_GLOG=ON` to cmake.
brpc implements a default [logging utility](../../src/butil/logging.h) which conflicts with glog. To replace this with glog, add `--with-glog` to config_brpc.sh or add `-DWITH_GLOG=ON` to cmake.

## valgrind: 3.8+

brpc detects valgrind automatically (and registers stacks of bthread). Older valgrind(say 3.2) is not supported.

## thrift: 0.9.3-0.11.0

## libunwind: 1.3-1.8.1

brpc does **not** link [libunwind](https://github.com/libunwind/libunwind) by default. Users link libunwind on-demand by adding `--with-glog` to config_brpc.sh or adding `-DWITH_GLOG=ON` to cmake.

It is recommended to use the latest possible version of libunwind.

no known issues.

# Track instances
Expand Down
3 changes: 3 additions & 0 deletions docs/images/bthread_status_model.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
3 changes: 3 additions & 0 deletions docs/images/bthread_stb_model.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
16 changes: 14 additions & 2 deletions src/brpc/builtin/bthreads_service.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
// under the License.


#include <ostream>
#include "bthread/bthread.h"
#include "brpc/closure_guard.h" // ClosureGuard
#include "brpc/controller.h" // Controller
#include "brpc/builtin/common.h"
Expand All @@ -40,12 +40,24 @@ void BthreadsService::default_method(::google::protobuf::RpcController* cntl_bas
const std::string& constraint = cntl->http_request().unresolved_path();

if (constraint.empty()) {
#ifdef BRPC_BTHREAD_TRACER
os << "Use /bthreads/<bthread_id> or /bthreads/<bthread_id>?st=1 for stack trace";
#else
os << "Use /bthreads/<bthread_id>";
#endif // BRPC_BTHREAD_TRACER
} else {
char* endptr = NULL;
bthread_t tid = strtoull(constraint.c_str(), &endptr, 10);
if (*endptr == '\0' || *endptr == '/') {
if (*endptr == '\0' || *endptr == '/' || *endptr == '?') {
::bthread::print_task(os, tid);

#ifdef BRPC_BTHREAD_TRACER
const std::string* st = cntl->http_request().uri().GetQuery("st");
if (NULL != st && *st == "1") {
os << "\nbthread call stack:\n";
::bthread::stack_trace(os, tid);
}
#endif // BRPC_BTHREAD_TRACER
} else {
cntl->SetFailed(ENOMETHOD, "path=%s is not a bthread id",
constraint.c_str());
Expand Down
Loading

0 comments on commit c44cb27

Please sign in to comment.